From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- gfx/wr/swgl/src/blend.h | 864 ++++++++++++ gfx/wr/swgl/src/composite.h | 1386 ++++++++++++++++++ gfx/wr/swgl/src/gl.cc | 2851 +++++++++++++++++++++++++++++++++++++ gfx/wr/swgl/src/gl_defs.h | 220 +++ gfx/wr/swgl/src/glsl.h | 3119 +++++++++++++++++++++++++++++++++++++++++ gfx/wr/swgl/src/lib.rs | 12 + gfx/wr/swgl/src/program.h | 186 +++ gfx/wr/swgl/src/rasterize.h | 1680 ++++++++++++++++++++++ gfx/wr/swgl/src/swgl_ext.h | 1924 +++++++++++++++++++++++++ gfx/wr/swgl/src/swgl_fns.rs | 2489 ++++++++++++++++++++++++++++++++ gfx/wr/swgl/src/texture.h | 1310 +++++++++++++++++ gfx/wr/swgl/src/vector_type.h | 563 ++++++++ 12 files changed, 16604 insertions(+) create mode 100644 gfx/wr/swgl/src/blend.h create mode 100644 gfx/wr/swgl/src/composite.h create mode 100644 gfx/wr/swgl/src/gl.cc create mode 100644 gfx/wr/swgl/src/gl_defs.h create mode 100644 gfx/wr/swgl/src/glsl.h create mode 100644 gfx/wr/swgl/src/lib.rs create mode 100644 gfx/wr/swgl/src/program.h create mode 100644 gfx/wr/swgl/src/rasterize.h create mode 100644 gfx/wr/swgl/src/swgl_ext.h create mode 100644 gfx/wr/swgl/src/swgl_fns.rs create mode 100644 gfx/wr/swgl/src/texture.h create mode 100644 gfx/wr/swgl/src/vector_type.h (limited to 'gfx/wr/swgl/src') diff --git a/gfx/wr/swgl/src/blend.h b/gfx/wr/swgl/src/blend.h new file mode 100644 index 0000000000..af29fb4c09 --- /dev/null +++ b/gfx/wr/swgl/src/blend.h @@ -0,0 +1,864 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) { +#if USE_SSE2 + return _mm_packs_epi32(a, b); +#elif USE_NEON + return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); +#else + return CONVERT(combine(a, b), HalfRGBA8); +#endif +} + +static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v, + float scale = 255.0f) { + ivec4 i = round_pixel(v, scale); + HalfRGBA8 xz = packRGBA8(i.z, i.x); + HalfRGBA8 yw = packRGBA8(i.y, i.w); + HalfRGBA8 xyzwl = zipLow(xz, yw); + HalfRGBA8 xyzwh = zipHigh(xz, yw); + HalfRGBA8 lo = zip2Low(xyzwl, xyzwh); + HalfRGBA8 hi = zip2High(xyzwl, xyzwh); + return combine(lo, hi); +} + +static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha, + float scale = 255.0f) { + I32 i = round_pixel(alpha, scale); + HalfRGBA8 c = packRGBA8(i, i); + c = zipLow(c, c); + return zip(c, c); +} + +static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha, + float scale = 255.0f) { + I32 i = round_pixel(alpha, scale); + return repeat2(packRGBA8(i, i)); +} + +UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v, + float scale = 255.0f) { + I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale); + return repeat2(packRGBA8(i, i)); +} + +static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() { + return pack_pixels_RGBA8(fragment_shader->gl_FragColor); +} + +static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v, + float scale = 255.0f) { + ivec4 i = round_pixel(bit_cast(v), scale); + return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w)); +} + +static ALWAYS_INLINE WideR8 packR8(I32 a) { +#if USE_SSE2 + return lowHalf(bit_cast>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovun_s32(a); +#else + return CONVERT(a, WideR8); +#endif +} + +static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) { + return packR8(round_pixel(c, scale)); +} + +static ALWAYS_INLINE WideR8 pack_pixels_R8() { + return pack_pixels_R8(fragment_shader->gl_FragColor.x); +} + +// Load a partial span > 0 and < 4 
pixels. +template +static ALWAYS_INLINE V partial_load_span(const P* src, int span) { + return bit_cast( + (span >= 2 + ? combine(unaligned_load>(src), + V2

{span > 2 ? unaligned_load

(src + 2) : P(0), 0}) + : V4

{unaligned_load

(src), 0, 0, 0})); +} + +// Store a partial span > 0 and < 4 pixels. +template +static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) { + auto pixels = bit_cast>(src); + if (span >= 2) { + unaligned_store(dst, lowHalf(pixels)); + if (span > 2) { + unaligned_store(dst + 2, pixels.z); + } + } else { + unaligned_store(dst, pixels.x); + } +} + +// Dispatcher that chooses when to load a full or partial span +template +static ALWAYS_INLINE V load_span(const P* src, int span) { + if (span >= 4) { + return unaligned_load(src); + } else { + return partial_load_span(src, span); + } +} + +// Dispatcher that chooses when to store a full or partial span +template +static ALWAYS_INLINE void store_span(P* dst, V src, int span) { + if (span >= 4) { + unaligned_store(dst, src); + } else { + partial_store_span(dst, src, span); + } +} + +template +static ALWAYS_INLINE T muldiv256(T x, T y) { + return (x * y) >> 8; +} + +// (x*y + x) >> 8, cheap approximation of (x*y) / 255 +template +static ALWAYS_INLINE T muldiv255(T x, T y) { + return (x * y + x) >> 8; +} + +template +static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v, + float scale = 255.0f) { + return pack_pixels_RGBA8(v, scale); +} + +template +static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) { + return pack_pixels_R8(c, scale); +} + +// Helper functions to apply a color modulus when available. 
+struct NoColor {}; + +template +static ALWAYS_INLINE P applyColor(P src, NoColor) { + return src; +} + +struct InvertColor {}; + +template +static ALWAYS_INLINE P applyColor(P src, InvertColor) { + return 255 - src; +} + +template +static ALWAYS_INLINE P applyColor(P src, P color) { + return muldiv255(color, src); +} + +static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) { + return applyColor(unpack(src), color); +} + +template +static ALWAYS_INLINE auto packColor(P* buf, C color) { + return pack_span(buf, color, 255.0f); +} + +template +static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) { + return noColor; +} + +template +static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf, + InvertColor invertColor) { + return invertColor; +} + +// Single argument variation that takes an explicit destination buffer type. +template +static ALWAYS_INLINE auto packColor(C color) { + // Just pass in a typed null pointer, as the pack routines never use the + // pointer's value, just its type. + return packColor((P*)0, color); +} + +// Byte-wise addition for when x or y is a signed 8-bit value stored in the +// low byte of a larger type T only with zeroed-out high bits, where T is +// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used +// upon signed operands, using up all the precision in a 16 bit integer, and +// potentially losing the sign bit in the last >> 8 shift. Due to the +// properties of two's complement arithmetic, even though we've discarded the +// sign bit, we can still represent a negative number under addition (without +// requiring any extra sign bits), just that any negative number will behave +// like a large unsigned number under addition, generating a single carry bit +// on overflow that we need to discard. Thus, just doing a byte-wise add will +// overflow without the troublesome carry, giving us only the remaining 8 low +// bits we actually need while keeping the high bits at zero. 
+template +static ALWAYS_INLINE T addlow(T x, T y) { + typedef VectorType bytes; + return bit_cast(bit_cast(x) + bit_cast(y)); +} + +// Replace color components of each pixel with the pixel's alpha values. +template +static ALWAYS_INLINE T alphas(T c) { + return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); +} + +// Replace the alpha values of the first vector with alpha values from the +// second, while leaving the color components unmodified. +template +static ALWAYS_INLINE T set_alphas(T c, T a) { + return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31); +} + +// Miscellaneous helper functions for working with packed RGBA8 data. +static ALWAYS_INLINE HalfRGBA8 if_then_else(V8 c, HalfRGBA8 t, + HalfRGBA8 e) { + return bit_cast((c & t) | (~c & e)); +} + +template +static ALWAYS_INLINE VectorType if_then_else(VectorType c, + VectorType t, + VectorType e) { + return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)), + if_then_else(highHalf(c), highHalf(t), highHalf(e))); +} + +static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) { +#if USE_SSE2 + return bit_cast( + _mm_min_epi16(bit_cast>(x), bit_cast>(y))); +#elif USE_NEON + return vminq_u16(x, y); +#else + return if_then_else(x < y, x, y); +#endif +} + +template +static ALWAYS_INLINE VectorType min(VectorType x, + VectorType y) { + return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y))); +} + +static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) { +#if USE_SSE2 + return bit_cast( + _mm_max_epi16(bit_cast>(x), bit_cast>(y))); +#elif USE_NEON + return vmaxq_u16(x, y); +#else + return if_then_else(x > y, x, y); +#endif +} + +template +static ALWAYS_INLINE VectorType max(VectorType x, + VectorType y) { + return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y))); +} + +template +static ALWAYS_INLINE VectorType recip(VectorType v) { + return combine(recip(lowHalf(v)), recip(highHalf(v))); +} + +// Helper to get the 
reciprocal if the value is non-zero, or otherwise default +// to the supplied fallback value. +template +static ALWAYS_INLINE V recip_or(V v, float f) { + return if_then_else(v != V(0.0f), recip(v), V(f)); +} + +template +static ALWAYS_INLINE VectorType inversesqrt(VectorType v) { + return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v))); +} + +// Extract the alpha components so that we can cheaply calculate the reciprocal +// on a single SIMD register. Then multiply the duplicated alpha reciprocal with +// the pixel data. 0 alpha is treated as transparent black. +static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) { + Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f); + return v * a.xxxxyyyyzzzzwwww; +} + +// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to +// RGBA to unpack. +static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) { + return bit_cast( + SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15)); +} + +// The following lum/sat functions mostly follow the KHR_blend_equation_advanced +// specification but are rearranged to work on premultiplied data. +static ALWAYS_INLINE Float lumv3(vec3 v) { + return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f; +} + +static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); } + +static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); } + +static inline vec3 clip_color(vec3 v, Float lum, Float alpha) { + Float mincol = max(-minv3(v), lum); + Float maxcol = max(maxv3(v), alpha - lum); + return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f)); +} + +static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) { + return clip_color(base - lumv3(base), lumv3(ref), alpha); +} + +static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) { + vec3 diff = base - minv3(base); + Float sbase = maxv3(diff); + Float ssat = maxv3(sref) - minv3(sref); + // The sbase range is rescaled to ssat. 
If sbase has 0 extent, then rescale + // to black, as per specification. + return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha); +} + +// Flags the reflect the current blend-stage clipping to be applied. +enum SWGLClipFlag { + SWGL_CLIP_FLAG_MASK = 1 << 0, + SWGL_CLIP_FLAG_AA = 1 << 1, + SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2, +}; +static int swgl_ClipFlags = 0; +static BlendKey swgl_BlendOverride = BLEND_KEY_NONE; +static WideRGBA8 swgl_BlendColorRGBA8 = {0}; +static WideRGBA8 swgl_BlendAlphaRGBA8 = {0}; + +// A pointer into the color buffer for the start of the span. +static void* swgl_SpanBuf = nullptr; +// A pointer into the clip mask for the start of the span. +static uint8_t* swgl_ClipMaskBuf = nullptr; + +static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) { + return mask; +} +static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) { + WideRG8 maskRG = zip(mask, mask); + return zip(maskRG, maskRG); +} + +// Loads a chunk of clip masks. The current pointer into the color buffer is +// used to reconstruct the relative position within the span. From there, the +// pointer into the clip mask can be generated from the start of the clip mask +// span. +template +static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) { + return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf]; +} + +template +static ALWAYS_INLINE auto load_clip_mask(P* buf, int span) + -> decltype(expand_mask(buf, 0)) { + return expand_mask(buf, + unpack(load_span(get_clip_mask(buf), span))); +} + +// Temporarily removes masking from the blend stage, assuming the caller will +// handle it. +static ALWAYS_INLINE void override_clip_mask() { + blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE); +} + +// Restores masking to the blend stage, assuming it was previously overridden. 
+static ALWAYS_INLINE void restore_clip_mask() { + blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key); +} + +// A pointer to the start of the opaque destination region of the span for AA. +static const uint8_t* swgl_OpaqueStart = nullptr; +// The size, in bytes, of the opaque region. +static uint32_t swgl_OpaqueSize = 0; +// AA coverage distance offsets for the left and right edges. +static Float swgl_LeftAADist = 0.0f; +static Float swgl_RightAADist = 0.0f; +// AA coverage slope values used for accumulating coverage for each step. +static Float swgl_AASlope = 0.0f; + +// Get the amount of pixels we need to process before the start of the opaque +// region. +template +static ALWAYS_INLINE int get_aa_opaque_start(P* buf) { + return max(int((P*)swgl_OpaqueStart - buf), 0); +} + +// Assuming we are already in the opaque part of the span, return the remaining +// size of the opaque part. +template +static ALWAYS_INLINE int get_aa_opaque_size(P* buf) { + return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0); +} + +// Temporarily removes anti-aliasing from the blend stage, assuming the caller +// will handle it. +static ALWAYS_INLINE void override_aa() { + blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE); +} + +// Restores anti-aliasing to the blend stage, assuming it was previously +// overridden. 
+static ALWAYS_INLINE void restore_aa() { + blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key); +} + +static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst, + WideRGBA8 src, int span = 4) { + WideRGBA8 dst = unpack(pdst); + const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, + 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0}; + const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF, + 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF}; + const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255, + 0, 0, 0, 255, 0, 0, 0, 255}; + + // clang-format off + // Computes AA for the given pixel based on the offset of the pixel within + // destination row. Given the initial coverage offsets for the left and right + // edges, the offset is scaled by the slope and accumulated to find the + // minimum coverage value for the pixel. A final weight is generated that + // can be used to scale the source pixel. +#define DO_AA(format, body) \ + do { \ + int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \ + if (uint32_t(offset) >= swgl_OpaqueSize) { \ + Float delta = swgl_AASlope * float(offset); \ + Float dist = clamp(min(swgl_LeftAADist + delta.x, \ + swgl_RightAADist + delta.y), \ + 0.0f, 256.0f); \ + auto aa = pack_pixels_##format(dist, 1.0f); \ + body; \ + } \ + } while (0) + + // Each blend case is preceded by the MASK_ variant. The MASK_ case first + // loads the mask values and multiplies the source value by them. After, it + // falls through to the normal blending case using the masked source. The + // AA_ variations may further precede the blend cases, in which case the + // source value is further modified before use. 
+#define BLEND_CASE_KEY(key) \ + case AA_##key: \ + DO_AA(RGBA8, src = muldiv256(src, aa)); \ + goto key; \ + case AA_MASK_##key: \ + DO_AA(RGBA8, src = muldiv256(src, aa)); \ + FALLTHROUGH; \ + case MASK_##key: \ + src = muldiv255(src, load_clip_mask(buf, span)); \ + FALLTHROUGH; \ + case key: key + +#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__)) + + switch (blend_key) { + BLEND_CASE(GL_ONE, GL_ZERO): + return src; + BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, + GL_ONE_MINUS_SRC_ALPHA): + // dst + src.a*(src.rgb1 - dst) + // use addlow for signed overflow + return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst)); + BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - muldiv255(dst, alphas(src)); + BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR): + return dst - muldiv255(dst, src); + BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE): + return dst - (muldiv255(dst, src) & RGB_MASK); + BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA): + return dst - muldiv255(dst, alphas(src)); + BLEND_CASE(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + BLEND_CASE(GL_ONE, GL_ONE): + return src + dst; + BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - (muldiv255(dst, src) & ALPHA_MASK); + BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE): + // src*(1-dst.a) + dst*1 = src - src*dst.a + dst + return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK); + BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR): + // src*k + (1-src)*dst = src*k + dst - + // src*dst = dst + src*(k - dst) use addlow + // for signed overflow + return addlow( + dst, muldiv255(src, repeat2(ctx->blendcolor) - dst)); + + // We must explicitly handle the masked/anti-aliased secondary blend case. + // The secondary color as well as the source must be multiplied by the + // weights. 
+ case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + applyColor(dst, + packColor(fragment_shader->gl_SecondaryFragColor)); + return src + dst - secondary; + } + case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + applyColor(dst, + packColor(fragment_shader->gl_SecondaryFragColor)); + WideRGBA8 mask = load_clip_mask(buf, span); + return muldiv255(src, mask) + dst - muldiv255(secondary, mask); + } + case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + applyColor(dst, + packColor(fragment_shader->gl_SecondaryFragColor)); + DO_AA(RGBA8, { + src = muldiv256(src, aa); + secondary = muldiv256(secondary, aa); + }); + return src + dst - secondary; + } + case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + applyColor(dst, + packColor(fragment_shader->gl_SecondaryFragColor)); + WideRGBA8 mask = load_clip_mask(buf, span); + DO_AA(RGBA8, mask = muldiv256(mask, aa)); + return muldiv255(src, mask) + dst - muldiv255(secondary, mask); + } + + BLEND_CASE(GL_MIN): + return min(src, dst); + BLEND_CASE(GL_MAX): + return max(src, dst); + + // The KHR_blend_equation_advanced spec describes the blend equations such + // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to + // the result: + // Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*AD*(1-As) + // Ar = As*Ad + As*(1-Ad) + Ad*(1-As) + // However, working with unpremultiplied values requires expensive math to + // unpremultiply and premultiply again during blending. We can use the fact + // that premultiplied value P = C*A and simplify the equations such that no + // unpremultiplied colors are necessary, allowing us to stay with integer + // math that avoids floating-point conversions in the common case. Some of + // the blend modes require division or sqrt, in which case we do convert + // to (possibly transposed/unpacked) floating-point to implement the mode. 
+ // However, most common modes can still use cheaper premultiplied integer + // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified + // to: + // Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As) + // .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As + // Ar = As*Ad + As - As*Ad + Ad - Ad*As + // .. Ar = As + Ad - As*Ad + // Note that the alpha equation is the same for all blend equations, such + // that so long as the implementation results in As + Ad - As*Ad, we can + // avoid using separate instructions to compute the alpha result, which is + // dependent on the math used to implement each blend mode. The exact + // reductions used to get the final math for every blend mode are too + // involved to show here in comments, but mostly follows from replacing + // Cs*As and Cd*Ad with Ps and Ps while factoring out as many common terms + // as possible. + + BLEND_CASE(GL_MULTIPLY_KHR): { + WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK), + alphas(dst) - (dst & RGB_MASK)); + return src + dst + (diff & RGB_MASK) - alphas(diff); + } + BLEND_CASE(GL_SCREEN_KHR): + return src + dst - muldiv255(src, dst); + BLEND_CASE(GL_OVERLAY_KHR): { + WideRGBA8 srcA = alphas(src); + WideRGBA8 dstA = alphas(dst); + WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); + return src + dst + + if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff), + -diff); + } + BLEND_CASE(GL_DARKEN_KHR): + return src + dst - + max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); + BLEND_CASE(GL_LIGHTEN_KHR): + return src + dst - + min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); + + BLEND_CASE(GL_COLORDODGE_KHR): { + // Color-dodge and color-burn require division, so we convert to FP math + // here, but avoid transposing to a vec4. 
+ WideRGBA32F srcF = CONVERT(src, WideRGBA32F); + WideRGBA32F srcA = alphas(srcF); + WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); + WideRGBA32F dstA = alphas(dstF); + return pack_pixels_RGBA8( + srcA * set_alphas( + min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)), + dstF) + + srcF * (255.0f - dstA) + dstF * (255.0f - srcA), + 1.0f / 255.0f); + } + BLEND_CASE(GL_COLORBURN_KHR): { + WideRGBA32F srcF = CONVERT(src, WideRGBA32F); + WideRGBA32F srcA = alphas(srcF); + WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); + WideRGBA32F dstA = alphas(dstF); + return pack_pixels_RGBA8( + srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA * + recip_or(srcF, 255.0f))), + dstF) + + srcF * (255.0f - dstA) + dstF * (255.0f - srcA), + 1.0f / 255.0f); + } + BLEND_CASE(GL_HARDLIGHT_KHR): { + WideRGBA8 srcA = alphas(src); + WideRGBA8 dstA = alphas(dst); + WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); + return src + dst + + if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff), + -diff); + } + + BLEND_CASE(GL_SOFTLIGHT_KHR): { + // Soft-light requires an unpremultiply that can't be factored out as + // well as a sqrt, so we convert to FP math here, but avoid transposing + // to a vec4. 
+ WideRGBA32F srcF = CONVERT(src, WideRGBA32F); + WideRGBA32F srcA = alphas(srcF); + WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); + WideRGBA32F dstA = alphas(dstF); + WideRGBA32F dstU = unpremultiply(dstF); + WideRGBA32F scale = srcF + srcF - srcA; + return pack_pixels_RGBA8( + dstF * (255.0f + + set_alphas( + scale * + if_then_else(scale < 0.0f, 1.0f - dstU, + min((16.0f * dstU - 12.0f) * dstU + 3.0f, + inversesqrt(dstU) - 1.0f)), + WideRGBA32F(0.0f))) + + srcF * (255.0f - dstA), + 1.0f / 255.0f); + } + BLEND_CASE(GL_DIFFERENCE_KHR): { + WideRGBA8 diff = + min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst))); + return src + dst - diff - (diff & RGB_MASK); + } + BLEND_CASE(GL_EXCLUSION_KHR): { + WideRGBA8 diff = muldiv255(src, dst); + return src + dst - diff - (diff & RGB_MASK); + } + + // The HSL blend modes are non-separable and require complicated use of + // division. It is advantageous to convert to FP and transpose to vec4 + // math to more easily manipulate the individual color components. +#define DO_HSL(rgb) \ + do { \ + vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); \ + vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); \ + Float srcA = srcV.w * (1.0f / 255.0f); \ + Float dstA = dstV.w * (1.0f / 255.0f); \ + Float srcDstA = srcV.w * dstA; \ + vec3 srcC = vec3(srcV) * dstA; \ + vec3 dstC = vec3(dstV) * srcA; \ + return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \ + srcV.w + dstV.w - srcDstA), \ + 1.0f); \ + } while (0) + + BLEND_CASE(GL_HSL_HUE_KHR): + DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA)); + BLEND_CASE(GL_HSL_SATURATION_KHR): + DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA)); + BLEND_CASE(GL_HSL_COLOR_KHR): + DO_HSL(set_lum(srcC, dstC, srcDstA)); + BLEND_CASE(GL_HSL_LUMINOSITY_KHR): + DO_HSL(set_lum(dstC, srcC, srcDstA)); + + // SWGL-specific extended blend modes. 
+ BLEND_CASE(SWGL_BLEND_DROP_SHADOW): { + // Premultiplied alpha over blend, but with source color set to source alpha + // modulated with a constant color. + WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8); + return color + dst - muldiv255(dst, alphas(color)); + } + + BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT): + // Premultiplied alpha over blend, but treats the source as a subpixel mask + // modulated with a constant color. + return applyColor(src, swgl_BlendColorRGBA8) + dst - + muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8)); + + default: + UNREACHABLE; + // return src; + } + +#undef BLEND_CASE +#undef BLEND_CASE_KEY + // clang-format on +} + +static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src, + int span = 4) { + // clang-format off +#define BLEND_CASE_KEY(key) \ + case AA_##key: \ + DO_AA(R8, src = muldiv256(src, aa)); \ + goto key; \ + case AA_MASK_##key: \ + DO_AA(R8, src = muldiv256(src, aa)); \ + FALLTHROUGH; \ + case MASK_##key: \ + src = muldiv255(src, load_clip_mask(buf, span)); \ + FALLTHROUGH; \ + case key: key + +#define BLEND_CASE(...) 
BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__)) + + switch (blend_key) { + BLEND_CASE(GL_ONE, GL_ZERO): + return src; + BLEND_CASE(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + BLEND_CASE(GL_ONE, GL_ONE): + return src + dst; + default: + UNREACHABLE; + // return src; + } + +#undef BLEND_CASE +#undef BLEND_CASE_KEY + // clang-format on +} + +static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) { + unaligned_store(buf, pack(r)); +} + +static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) { + partial_store_span(buf, pack(r), len); +} + +static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) { + return blend_pixels(buf, unaligned_load(buf), r); +} + +static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) { + return blend_pixels(buf, partial_load_span(buf, len), r, len); +} + +static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) { + unaligned_store(buf, r); +} + +static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) { + partial_store_span(buf, r, len); +} + +static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) { + return pack(blend_span(buf, unpack(r))); +} + +static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r, + int len) { + return pack(blend_span(buf, unpack(r), len)); +} + +static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) { + unaligned_store(buf, pack(r)); +} + +static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) { + partial_store_span(buf, pack(r), len); +} + +static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) { + return blend_pixels(buf, unpack(unaligned_load(buf)), r); +} + +static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) { + return blend_pixels(buf, unpack(partial_load_span(buf, len)), r, + len); +} + +static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) { + unaligned_store(buf, r); +} + +static ALWAYS_INLINE void 
commit_span(uint8_t* buf, PackedR8 r, int len) { + partial_store_span(buf, r, len); +} + +static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) { + return pack(blend_span(buf, unpack(r))); +} + +static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) { + return pack(blend_span(buf, unpack(r), len)); +} + +template +static ALWAYS_INLINE void commit_blend_span(P* buf, R r) { + if (BLEND) { + commit_span(buf, blend_span(buf, r)); + } else { + commit_span(buf, r); + } +} + +template +static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) { + if (BLEND) { + commit_span(buf, blend_span(buf, r, len), len); + } else { + commit_span(buf, r, len); + } +} + +template +static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) { + for (P* end = &buf[len & ~3]; buf < end; buf += 4) { + commit_span(buf, blend_span(buf, r)); + } + len &= 3; + if (len > 0) { + partial_store_span(buf, pack(blend_span(buf, r, len)), len); + } +} + +template +static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) { + commit_blend_solid_span(buf, r, len); +} + +template <> +ALWAYS_INLINE void commit_solid_span(uint32_t* buf, WideRGBA8 r, + int len) { + fill_n(buf, len, bit_cast(pack(r)).x); +} + +template +static void commit_solid_span(uint8_t* buf, WideR8 r, int len) { + commit_blend_solid_span(buf, r, len); +} + +template <> +ALWAYS_INLINE void commit_solid_span(uint8_t* buf, WideR8 r, int len) { + PackedR8 p = pack(r); + if (uintptr_t(buf) & 3) { + int align = 4 - (uintptr_t(buf) & 3); + align = min(align, len); + partial_store_span(buf, p, align); + buf += align; + len -= align; + } + fill_n((uint32_t*)buf, len / 4, bit_cast(p)); + buf += len & ~3; + len &= 3; + if (len > 0) { + partial_store_span(buf, p, len); + } +} diff --git a/gfx/wr/swgl/src/composite.h b/gfx/wr/swgl/src/composite.h new file mode 100644 index 0000000000..70acabeca4 --- /dev/null +++ b/gfx/wr/swgl/src/composite.h @@ -0,0 +1,1386 @@ +/* This Source Code Form 
is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Converts a pixel from a source format to a destination format. By default, +// just return the value unchanged as for a simple copy. +template +static ALWAYS_INLINE P convert_pixel(U src) { + return src; +} + +// R8 format maps to BGRA value 0,0,R,1. The byte order is endian independent, +// but the shifts unfortunately depend on endianness. +template <> +ALWAYS_INLINE uint32_t convert_pixel(uint8_t src) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return (uint32_t(src) << 16) | 0xFF000000; +#else + return (uint32_t(src) << 8) | 0x000000FF; +#endif +} + +// RG8 format maps to BGRA value 0,G,R,1. +template <> +ALWAYS_INLINE uint32_t convert_pixel(uint16_t src) { + uint32_t rg = src; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return ((rg & 0x00FF) << 16) | (rg & 0xFF00) | 0xFF000000; +#else + return (rg & 0xFF00) | ((rg & 0x00FF) << 16) | 0x000000FF; +#endif +} + +// RGBA8 format maps to R. +template <> +ALWAYS_INLINE uint8_t convert_pixel(uint32_t src) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return (src >> 16) & 0xFF; +#else + return (src >> 8) & 0xFF; +#endif +} + +// RGBA8 formats maps to R,G. +template <> +ALWAYS_INLINE uint16_t convert_pixel(uint32_t src) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return ((src >> 16) & 0x00FF) | (src & 0xFF00); +#else + return (src & 0xFF00) | ((src >> 16) & 0x00FF); +#endif +} + +// R8 format maps to R,0. +template <> +ALWAYS_INLINE uint16_t convert_pixel(uint8_t src) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return src; +#else + return uint16_t(src) << 8; +#endif +} + +// RG8 format maps to R. 
+template <> +ALWAYS_INLINE uint8_t convert_pixel(uint16_t src) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return src & 0xFF; +#else + return src >> 8; +#endif +} + +template +static inline void copy_row(P* dst, const P* src, int span) { + // No scaling, so just do a fast copy. + memcpy(dst, src, span * sizeof(P)); +} + +template <> +void copy_row(uint32_t* dst, const uint32_t* src, int span) { + // No scaling, so just do a fast composite. + auto* end = dst + span; + while (dst + 4 <= end) { + WideRGBA8 srcpx = unpack(unaligned_load(src)); + WideRGBA8 dstpx = unpack(unaligned_load(dst)); + PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + unaligned_store(dst, r); + src += 4; + dst += 4; + } + if (dst < end) { + WideRGBA8 srcpx = unpack(partial_load_span(src, end - dst)); + WideRGBA8 dstpx = unpack(partial_load_span(dst, end - dst)); + auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + partial_store_span(dst, r, end - dst); + } +} + +template +static inline void scale_row(P* dst, int dstWidth, const U* src, int srcWidth, + int span, int frac) { + // Do scaling with different source and dest widths. + for (P* end = dst + span; dst < end; dst++) { + *dst = convert_pixel

(*src); + // Step source according to width ratio. + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + } +} + +template <> +void scale_row(uint32_t* dst, int dstWidth, + const uint32_t* src, int srcWidth, + int span, int frac) { + // Do scaling with different source and dest widths. + // Gather source pixels four at a time for better packing. + auto* end = dst + span; + for (; dst + 4 <= end; dst += 4) { + U32 srcn; + srcn.x = *src; + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + srcn.y = *src; + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + srcn.z = *src; + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + srcn.w = *src; + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + WideRGBA8 srcpx = unpack(bit_cast(srcn)); + WideRGBA8 dstpx = unpack(unaligned_load(dst)); + PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + unaligned_store(dst, r); + } + if (dst < end) { + // Process any remaining pixels. Try to gather as many pixels as possible + // into a single source chunk for compositing. 
+ U32 srcn = {*src, 0, 0, 0}; + if (end - dst > 1) { + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + srcn.y = *src; + if (end - dst > 2) { + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + srcn.z = *src; + } + } + WideRGBA8 srcpx = unpack(bit_cast(srcn)); + WideRGBA8 dstpx = unpack(partial_load_span(dst, end - dst)); + auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + partial_store_span(dst, r, end - dst); + } +} + +template +static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq, + Texture& dsttex, const IntRect& dstReq, + bool invertY, const IntRect& clipRect) { + assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && + dsttex.internal_format == GL_RGBA8)); + // Cache scaling ratios + int srcWidth = srcReq.width(); + int srcHeight = srcReq.height(); + int dstWidth = dstReq.width(); + int dstHeight = dstReq.height(); + // Compute valid dest bounds + IntRect dstBounds = dsttex.sample_bounds(dstReq).intersect(clipRect); + // Compute valid source bounds + IntRect srcBounds = srctex.sample_bounds(srcReq, invertY); + // If srcReq is outside the source texture, we need to clip the sampling + // bounds so that we never sample outside valid source bounds. Get texture + // bounds relative to srcReq and scale to dest-space rounding inward, using + // this rect to limit the dest bounds further. + IntRect srcClip = srctex.bounds() - srcReq.origin(); + if (invertY) { + srcClip.invert_y(srcReq.height()); + } + srcClip.scale(srcWidth, srcHeight, dstWidth, dstHeight, true); + dstBounds.intersect(srcClip); + // Check if clipped sampling bounds are empty + if (dstBounds.is_empty()) { + return; + } + + // Calculate source and dest pointers from clamped offsets + int srcStride = srctex.stride(); + int destStride = dsttex.stride(); + char* dest = dsttex.sample_ptr(dstReq, dstBounds); + // Clip the source bounds by the destination offset. 
+ int fracX = srcWidth * dstBounds.x0; + int fracY = srcHeight * dstBounds.y0; + srcBounds.x0 = max(fracX / dstWidth, srcBounds.x0); + srcBounds.y0 = max(fracY / dstHeight, srcBounds.y0); + fracX %= dstWidth; + fracY %= dstHeight; + char* src = srctex.sample_ptr(srcReq, srcBounds, invertY); + // Inverted Y must step downward along source rows + if (invertY) { + srcStride = -srcStride; + } + int span = dstBounds.width(); + for (int rows = dstBounds.height(); rows > 0; rows--) { + switch (srctex.bpp()) { + case 1: + switch (dsttex.bpp()) { + case 2: + scale_row((uint16_t*)dest, dstWidth, (uint8_t*)src, + srcWidth, span, fracX); + break; + case 4: + scale_row((uint32_t*)dest, dstWidth, (uint8_t*)src, + srcWidth, span, fracX); + break; + default: + if (srcWidth == dstWidth) + copy_row((uint8_t*)dest, (uint8_t*)src, span); + else + scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, + srcWidth, span, fracX); + break; + } + break; + case 2: + switch (dsttex.bpp()) { + case 1: + scale_row((uint8_t*)dest, dstWidth, (uint16_t*)src, + srcWidth, span, fracX); + break; + case 4: + scale_row((uint32_t*)dest, dstWidth, (uint16_t*)src, + srcWidth, span, fracX); + break; + default: + if (srcWidth == dstWidth) + copy_row((uint16_t*)dest, (uint16_t*)src, span); + else + scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, + srcWidth, span, fracX); + break; + } + break; + case 4: + switch (dsttex.bpp()) { + case 1: + scale_row((uint8_t*)dest, dstWidth, (uint32_t*)src, + srcWidth, span, fracX); + break; + case 2: + scale_row((uint16_t*)dest, dstWidth, (uint32_t*)src, + srcWidth, span, fracX); + break; + default: + if (srcWidth == dstWidth) + copy_row((uint32_t*)dest, (uint32_t*)src, span); + else + scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, + srcWidth, span, fracX); + break; + } + break; + default: + assert(false); + break; + } + dest += destStride; + // Step source according to height ratio. 
+ for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) { + src += srcStride; + } + } +} + +template +static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, sampler2D sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span -= 4) { + auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); + unaligned_store(dest, srcpx); + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); + partial_store_span(dest, srcpx, span); + } +} + +template <> +void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, sampler2D sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span -= 4) { + WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); + WideRGBA8 dstpx = unpack(unaligned_load(dest)); + PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + unaligned_store(dest, r); + + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); + WideRGBA8 dstpx = unpack(partial_load_span(dest, span)); + PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + partial_store_span(dest, r, span); + } +} + +template +static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, sampler2D sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span -= 4) { + auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); + unaligned_store(dest, srcpx); + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); + partial_store_span(dest, srcpx, span); + } +} + +template +static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, sampler2D sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span 
-= 4) { + auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); + unaligned_store(dest, srcpx); + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); + partial_store_span(dest, srcpx, span); + } +} + +template +static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq, + Texture& dsttex, const IntRect& dstReq, + bool invertX, bool invertY, + const IntRect& clipRect) { + assert(srctex.internal_format == GL_RGBA8 || + srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8); + assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && + dsttex.internal_format == GL_RGBA8)); + // Compute valid dest bounds + IntRect dstBounds = dsttex.sample_bounds(dstReq); + dstBounds.intersect(clipRect); + // Check if sampling bounds are empty + if (dstBounds.is_empty()) { + return; + } + // Initialize sampler for source texture + sampler2D_impl sampler; + init_sampler(&sampler, srctex); + sampler.filter = TextureFilter::LINEAR; + // Compute source UVs + vec2_scalar srcUV(srcReq.x0, srcReq.y0); + vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), + float(srcReq.height()) / dstReq.height()); + if (invertX) { + // Advance to the end of the row and flip the step. 
+ srcUV.x += srcReq.width(); + srcDUV.x = -srcDUV.x; + } + // Inverted Y must step downward along source rows + if (invertY) { + srcUV.y += srcReq.height(); + srcDUV.y = -srcDUV.y; + } + // Skip to clamped source start + srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f); + // Scale UVs by lerp precision + srcUV = linearQuantize(srcUV, 128); + srcDUV *= 128.0f; + // Calculate dest pointer from clamped offsets + int bpp = dsttex.bpp(); + int destStride = dsttex.stride(); + char* dest = dsttex.sample_ptr(dstReq, dstBounds); + int span = dstBounds.width(); + for (int rows = dstBounds.height(); rows > 0; rows--) { + switch (bpp) { + case 1: + linear_row_blit((uint8_t*)dest, span, srcUV, srcDUV.x, + &sampler); + break; + case 2: + linear_row_blit((uint16_t*)dest, span, srcUV, srcDUV.x, + &sampler); + break; + case 4: + linear_row_blit((uint32_t*)dest, span, srcUV, srcDUV.x, + &sampler); + break; + default: + assert(false); + break; + } + dest += destStride; + srcUV.y += srcDUV.y; + } +} + +// Whether the blit format is renderable. 
+static inline bool is_renderable_format(GLenum format) { + switch (format) { + case GL_R8: + case GL_RG8: + case GL_RGBA8: + return true; + default: + return false; + } +} + +extern "C" { + +void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, + GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, + GLbitfield mask, GLenum filter) { + assert(mask == GL_COLOR_BUFFER_BIT); + Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!srcfb) return; + Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER); + if (!dstfb) return; + Texture& srctex = ctx->textures[srcfb->color_attachment]; + if (!srctex.buf) return; + Texture& dsttex = ctx->textures[dstfb->color_attachment]; + if (!dsttex.buf) return; + assert(!dsttex.locked); + if (srctex.internal_format != dsttex.internal_format && + (!is_renderable_format(srctex.internal_format) || + !is_renderable_format(dsttex.internal_format))) { + // If the internal formats don't match, then we may have to convert. Require + // that the format is a simple renderable format to limit combinatoric + // explosion for now. 
+ assert(false); + return; + } + // Force flipped Y onto dest coordinates + if (srcY1 < srcY0) { + swap(srcY0, srcY1); + swap(dstY0, dstY1); + } + bool invertY = dstY1 < dstY0; + if (invertY) { + swap(dstY0, dstY1); + } + IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset; + IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset; + if (srcReq.is_empty() || dstReq.is_empty()) { + return; + } + IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()}; + prepare_texture(srctex); + prepare_texture(dsttex, &dstReq); + if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR && + srctex.internal_format == dsttex.internal_format && + is_renderable_format(srctex.internal_format)) { + linear_blit(srctex, srcReq, dsttex, dstReq, false, invertY, dstReq); + } else { + scale_blit(srctex, srcReq, dsttex, dstReq, invertY, clipRect); + } +} + +typedef Texture LockedTexture; + +// Lock the given texture to prevent modification. +LockedTexture* LockTexture(GLuint texId) { + Texture& tex = ctx->textures[texId]; + if (!tex.buf) { + assert(tex.buf != nullptr); + return nullptr; + } + if (__sync_fetch_and_add(&tex.locked, 1) == 0) { + // If this is the first time locking the texture, flush any delayed clears. + prepare_texture(tex); + } + return (LockedTexture*)&tex; +} + +// Lock the given framebuffer's color attachment to prevent modification. +LockedTexture* LockFramebuffer(GLuint fboId) { + Framebuffer& fb = ctx->framebuffers[fboId]; + // Only allow locking a framebuffer if it has a valid color attachment. 
+ if (!fb.color_attachment) { + assert(fb.color_attachment != 0); + return nullptr; + } + return LockTexture(fb.color_attachment); +} + +// Reference an already locked resource +void LockResource(LockedTexture* resource) { + if (!resource) { + return; + } + __sync_fetch_and_add(&resource->locked, 1); +} + +// Remove a lock on a texture that has been previously locked +void UnlockResource(LockedTexture* resource) { + if (!resource) { + return; + } + if (__sync_fetch_and_add(&resource->locked, -1) <= 0) { + // The lock should always be non-zero before unlocking. + assert(0); + } +} + +// Get the underlying buffer for a locked resource +void* GetResourceBuffer(LockedTexture* resource, int32_t* width, + int32_t* height, int32_t* stride) { + *width = resource->width; + *height = resource->height; + *stride = resource->stride(); + return resource->buf; +} + +// Extension for optimized compositing of textures or framebuffers that may be +// safely used across threads. The source and destination must be locked to +// ensure that they can be safely accessed while the SWGL context might be used +// by another thread. Band extents along the Y axis may be used to clip the +// destination rectangle without effecting the integer scaling ratios. 
+void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX, + GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, + GLint dstY, GLsizei dstWidth, GLsizei dstHeight, + GLboolean opaque, GLboolean flipX, GLboolean flipY, + GLenum filter, GLint clipX, GLint clipY, GLsizei clipWidth, + GLsizei clipHeight) { + if (!lockedDst || !lockedSrc) { + return; + } + Texture& srctex = *lockedSrc; + Texture& dsttex = *lockedDst; + assert(srctex.bpp() == 4); + assert(dsttex.bpp() == 4); + + IntRect srcReq = + IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset; + IntRect dstReq = + IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; + if (srcReq.is_empty() || dstReq.is_empty()) { + return; + } + + // Compute clip rect as relative to the dstReq, as that's the same coords + // as used for the sampling bounds. + IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, + clipY - dstY + clipHeight}; + // Ensure we have rows of at least 2 pixels when using the linear filter to + // avoid overreading the row. Force X flips onto the linear filter for now + // until scale_blit supports it. + bool useLinear = + srctex.width >= 2 && + (flipX || (!srcReq.same_size(dstReq) && filter == GL_LINEAR)); + + if (opaque) { + if (useLinear) { + linear_blit(srctex, srcReq, dsttex, dstReq, flipX, flipY, + clipRect); + } else { + scale_blit(srctex, srcReq, dsttex, dstReq, flipY, clipRect); + } + } else { + if (useLinear) { + linear_blit(srctex, srcReq, dsttex, dstReq, flipX, flipY, clipRect); + } else { + scale_blit(srctex, srcReq, dsttex, dstReq, flipY, clipRect); + } + } +} + +} // extern "C" + +// Saturated add helper for YUV conversion. Supported platforms have intrinsics +// to do this natively, but support a slower generic fallback just in case. 
+static inline V8 addsat(V8 x, V8 y) { +#if USE_SSE2 + return _mm_adds_epi16(x, y); +#elif USE_NEON + return vqaddq_s16(x, y); +#else + auto r = x + y; + // An overflow occurred if the signs of both inputs x and y did not differ + // but yet the sign of the result did differ. + auto overflow = (~(x ^ y) & (r ^ x)) >> 15; + // If there was an overflow, we need to choose the appropriate limit to clamp + // to depending on whether or not the inputs are negative. + auto limit = (x >> 15) ^ 0x7FFF; + // If we didn't overflow, just use the result, and otherwise, use the limit. + return (~overflow & r) | (overflow & limit); +#endif +} + +// Interleave and packing helper for YUV conversion. During transform by the +// color matrix, the color components are de-interleaved as this format is +// usually what comes out of the planar YUV textures. The components thus need +// to be interleaved before finally getting packed to BGRA format. Alpha is +// forced to be opaque. +static inline PackedRGBA8 packYUV(V8 gg, V8 br) { + return pack(bit_cast(zip(br, gg))) | + PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; +} + +// clang-format off +// Supports YUV color matrixes of the form: +// [R] [1.1643835616438356, 0.0, rv ] [Y - 16] +// [G] = [1.1643835616438358, -gu, -gv ] x [U - 128] +// [B] [1.1643835616438356, bu, 0.0 ] [V - 128] +// We must be able to multiply a YUV input by a matrix coefficient ranging as +// high as ~2.2 in the U/V cases, where U/V can be signed values between -128 +// and 127. The largest fixed-point representation we can thus support without +// overflowing 16 bit integers leaves us 6 bits of fractional precision while +// also supporting a sign bit. The closest representation of the Y coefficient +// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces +// we support. 
Conversions can still sometimes overflow the precision and +// require clamping back into range, so we use saturated additions to do this +// efficiently at no extra cost. +// clang-format on +struct YUVMatrix { + // These constants are loaded off the "this" pointer via relative addressing + // modes and should be about as quick to load as directly addressed SIMD + // constant memory. + + V8 br_uvCoeffs; // biased by 6 bits [b_from_u, r_from_v, repeats] + V8 gg_uvCoeffs; // biased by 6 bits [g_from_u, g_from_v, repeats] + V8 yCoeffs; // biased by 7 bits + V8 yBias; // 0 or 16 + V8 uvBias; // 128 + V8 br_yMask; + + // E.g. rec709-narrow: + // [ 1.16, 0, 1.79, -0.97 ] + // [ 1.16, -0.21, -0.53, 0.30 ] + // [ 1.16, 2.11, 0, -1.13 ] + // = + // [ yScale, 0, r_from_v ] ([Y ] ) + // [ yScale, g_from_u, g_from_v ] x ([cb] - ycbcr_bias ) + // [ yScale, b_from_u, 0 ] ([cr] ) + static YUVMatrix From(const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescale_factor = 0) { + assert(ycbcr_bias.z == ycbcr_bias.y); + + const auto rgb_from_y = rgb_from_debiased_ycbcr[0].y; + assert(rgb_from_debiased_ycbcr[0].x == rgb_from_debiased_ycbcr[0].z); + + int16_t br_from_y_mask = -1; + if (rgb_from_debiased_ycbcr[0].x == 0.0) { + // gbr-identity matrix? 
+ assert(rgb_from_debiased_ycbcr[0].x == 0); + assert(rgb_from_debiased_ycbcr[0].y >= 1); + assert(rgb_from_debiased_ycbcr[0].z == 0); + + assert(rgb_from_debiased_ycbcr[1].x == 0); + assert(rgb_from_debiased_ycbcr[1].y == 0); + assert(rgb_from_debiased_ycbcr[1].z >= 1); + + assert(rgb_from_debiased_ycbcr[2].x >= 1); + assert(rgb_from_debiased_ycbcr[2].y == 0); + assert(rgb_from_debiased_ycbcr[2].z == 0); + + assert(ycbcr_bias.x == 0); + assert(ycbcr_bias.y == 0); + assert(ycbcr_bias.z == 0); + + br_from_y_mask = 0; + } else { + assert(rgb_from_debiased_ycbcr[0].x == rgb_from_y); + } + + assert(rgb_from_debiased_ycbcr[1].x == 0.0); + const auto g_from_u = rgb_from_debiased_ycbcr[1].y; + const auto b_from_u = rgb_from_debiased_ycbcr[1].z; + + const auto r_from_v = rgb_from_debiased_ycbcr[2].x; + const auto g_from_v = rgb_from_debiased_ycbcr[2].y; + assert(rgb_from_debiased_ycbcr[2].z == 0.0); + + return YUVMatrix({ycbcr_bias.x, ycbcr_bias.y}, rgb_from_y, br_from_y_mask, + r_from_v, g_from_u, g_from_v, b_from_u, rescale_factor); + } + + // Convert matrix coefficients to fixed-point representation. If the matrix + // has a rescaling applied to it, then we need to take care to undo the + // scaling so that we can convert the coefficients to fixed-point range. The + // bias still requires shifting to apply the rescaling. The rescaling will be + // applied to the actual YCbCr sample data later by manually shifting it + // before applying this matrix. 
+ YUVMatrix(vec2_scalar yuv_bias, double yCoeff, int16_t br_yMask_, double rv, + double gu, double gv, double bu, int rescale_factor = 0) + : br_uvCoeffs(zip(I16(int16_t(bu * (1 << (6 - rescale_factor)) + 0.5)), + I16(int16_t(rv * (1 << (6 - rescale_factor)) + 0.5)))), + gg_uvCoeffs( + zip(I16(-int16_t(-gu * (1 << (6 - rescale_factor)) + + 0.5)), // These are negative coeffs, so + // round them away from zero + I16(-int16_t(-gv * (1 << (6 - rescale_factor)) + 0.5)))), + yCoeffs(uint16_t(yCoeff * (1 << (6 + 1 - rescale_factor)) + 0.5)), + // We have a +0.5 fudge-factor for -ybias. + // Without this, we get white=254 not 255. + // This approximates rounding rather than truncation during `gg >>= 6`. + yBias(int16_t(((yuv_bias.x * 255 * yCoeff) - 0.5) * (1 << 6))), + uvBias(int16_t(yuv_bias.y * (255 << rescale_factor) + 0.5)), + br_yMask(br_yMask_) { + assert(yuv_bias.x >= 0); + assert(yuv_bias.y >= 0); + assert(yCoeff > 0); + assert(br_yMask_ == 0 || br_yMask_ == -1); + assert(bu > 0); + assert(rv > 0); + assert(gu <= 0); + assert(gv <= 0); + assert(rescale_factor <= 6); + } + + ALWAYS_INLINE PackedRGBA8 convert(V8 yy, V8 uv) const { + // We gave ourselves an extra bit (7 instead of 6) of bias to give us some + // extra precision for the more-sensitive y scaling. + // Note that we have to use an unsigned multiply with a 2x scale to + // represent a fractional scale and to avoid shifting with the sign bit. + + // Note: if you subtract the bias before multiplication, we see more + // underflows. This could be fixed by an unsigned subsat. 
+ yy = bit_cast>((bit_cast>(yy) * yCoeffs) >> 1); + yy -= yBias; + + // Compute [B] = [yCoeff*Y + bu*U + 0*V] + // [R] [yCoeff*Y + 0*U + rv*V] + uv -= uvBias; + auto br = br_uvCoeffs * uv; + br = addsat(yy & br_yMask, br); + br >>= 6; + + // Compute G = yCoeff*Y + gu*U + gv*V + // First calc [gu*U, gv*V, ...]: + auto gg = gg_uvCoeffs * uv; + // Then cross the streams to get `gu*U + gv*V`: + gg = addsat(gg, bit_cast>(bit_cast>(gg) >> 16)); + // Add the other parts: + gg = addsat(yy, gg); // This is the part that needs the most headroom + // usually. In particular, ycbcr(255,255,255) hugely + // saturates. + gg >>= 6; + + // Interleave B/R and G values. Force alpha (high-gg half) to opaque. + return packYUV(gg, br); + } +}; + +// Helper function for textureLinearRowR8 that samples horizontal taps and +// combines them based on Y fraction with next row. +template +static ALWAYS_INLINE V8 linearRowTapsR8(S sampler, I32 ix, + int32_t offsety, + int32_t stridey, + int16_t fracy) { + uint8_t* buf = (uint8_t*)sampler->buf + offsety; + auto a0 = unaligned_load>(&buf[ix.x]); + auto b0 = unaligned_load>(&buf[ix.y]); + auto c0 = unaligned_load>(&buf[ix.z]); + auto d0 = unaligned_load>(&buf[ix.w]); + auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8); + buf += stridey; + auto a1 = unaligned_load>(&buf[ix.x]); + auto b1 = unaligned_load>(&buf[ix.y]); + auto c1 = unaligned_load>(&buf[ix.z]); + auto d1 = unaligned_load>(&buf[ix.w]); + auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8); + abcd0 += ((abcd1 - abcd0) * fracy) >> 7; + return abcd0; +} + +// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes +// constant Y and returns a duplicate of the result interleaved with itself +// to aid in later YUV transformation. +template +static inline V8 textureLinearRowR8(S sampler, I32 ix, int32_t offsety, + int32_t stridey, int16_t fracy) { + assert(sampler->format == TextureFormat::R8); + + // Calculate X fraction and clamp X offset into range. 
+ I32 fracx = ix; + ix >>= 7; + fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F; + ix = clampCoord(ix, sampler->width - 1); + + // Load the sample taps and combine rows. + auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy); + + // Unzip the result and do final horizontal multiply-add base on X fraction. + auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6); + auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7); + abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7; + + // The final result is the packed values interleaved with a duplicate of + // themselves. + return abcdl; +} + +// Optimized version of textureLinearPackedR8 for paired U/V R8 textures. +// Since the two textures have the same dimensions and stride, the addressing +// math can be shared between both samplers. This also allows a coalesced +// multiply in the final stage by packing both U/V results into a single +// operation. +template +static inline V8 textureLinearRowPairedR8(S sampler, S sampler2, + I32 ix, int32_t offsety, + int32_t stridey, + int16_t fracy) { + assert(sampler->format == TextureFormat::R8 && + sampler2->format == TextureFormat::R8); + assert(sampler->width == sampler2->width && + sampler->height == sampler2->height); + assert(sampler->stride == sampler2->stride); + + // Calculate X fraction and clamp X offset into range. + I32 fracx = ix; + ix >>= 7; + fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F; + ix = clampCoord(ix, sampler->width - 1); + + // Load the sample taps for the first sampler and combine rows. + auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy); + + // Load the sample taps for the second sampler and combine rows. + auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy); + + // We are left with a result vector for each sampler with values for adjacent + // pixels interleaved together in each. 
We need to unzip these values so that + // we can do the final horizontal multiply-add based on the X fraction. + auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14); + auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15); + abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7; + + // The final result is the packed values for the first sampler interleaved + // with the packed values for the second sampler. + return abcdxyzwl; +} + +// Casting to int loses some precision while stepping that can offset the +// image, so shift the values by some extra bits of precision to minimize +// this. We support up to 16 bits of image size, 7 bits of quantization, +// and 1 bit for sign, which leaves 8 bits left for extra precision. +const int STEP_BITS = 8; + +// Optimized version of textureLinearPackedR8 for Y R8 texture with +// half-resolution paired U/V R8 textures. This allows us to more efficiently +// pack YUV samples into vectors to substantially reduce math operations even +// further. +template +static inline void upscaleYUV42R8(uint32_t* dest, int span, uint8_t* yRow, + I32 yU, int32_t yDU, int32_t yStrideV, + int16_t yFracV, uint8_t* cRow1, + uint8_t* cRow2, I32 cU, int32_t cDU, + int32_t cStrideV, int16_t cFracV, + const YUVMatrix& colorSpace) { + // As much as possible try to utilize the fact that we're only using half + // the UV samples to combine Y and UV samples into single vectors. Here we + // need to initialize several useful vector quantities for stepping fractional + // offsets. For the UV samples, we take the average of the first+second and + // third+fourth samples in a chunk which conceptually correspond to offsets + // 0.5 and 1.5 (in 0..2 range). This allows us to reconstruct intermediate + // samples 0.25, 0.75, 1.25, and 1.75 later. 
X fraction is shifted over into + // the top 7 bits of an unsigned short so that we can mask off the exact + // fractional bits we need to blend merely by right shifting them into + // position. + cU = (cU.xzxz + cU.ywyw) >> 1; + auto ycFracX = CONVERT(combine(yU, cU), V8) + << (16 - (STEP_BITS + 7)); + auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7)); + auto ycFracV = combine(I16(yFracV), I16(cFracV)); + I32 yI = yU >> (STEP_BITS + 7); + I32 cI = cU >> (STEP_BITS + 7); + // Load initial combined YUV samples for each row and blend them. + auto ycSrc0 = + CONVERT(combine(unaligned_load>(&yRow[yI.x]), + combine(unaligned_load>(&cRow1[cI.x]), + unaligned_load>(&cRow2[cI.x]))), + V8); + auto ycSrc1 = CONVERT( + combine(unaligned_load>(&yRow[yI.x + yStrideV]), + combine(unaligned_load>(&cRow1[cI.x + cStrideV]), + unaligned_load>(&cRow2[cI.x + cStrideV]))), + V8); + auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7); + + // Here we shift in results from the next sample while caching results from + // the previous sample. This allows us to reduce the multiplications in the + // inner loop down to only two since we just need to blend the new samples + // horizontally and then vertically once each. + for (uint32_t* end = dest + span; dest < end; dest += 4) { + yU += yDU; + I32 yIn = yU >> (STEP_BITS + 7); + cU += cDU; + I32 cIn = cU >> (STEP_BITS + 7); + // Load combined YUV samples for the next chunk on each row and blend them. + auto ycSrc0n = + CONVERT(combine(unaligned_load>(&yRow[yIn.x]), + combine(unaligned_load>(&cRow1[cIn.x]), + unaligned_load>(&cRow2[cIn.x]))), + V8); + auto ycSrc1n = CONVERT( + combine(unaligned_load>(&yRow[yIn.x + yStrideV]), + combine(unaligned_load>(&cRow1[cIn.x + cStrideV]), + unaligned_load>(&cRow2[cIn.x + cStrideV]))), + V8); + auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7); + + // The source samples for the chunk may not match the actual tap offsets. 
+ // Since we're upscaling, we know the tap offsets fall within all the + // samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar, + // instead we do laborious shuffling here for the Y samples and then the UV + // samples. + auto yshuf = lowHalf(ycSrc); + auto yshufn = + SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn), + 1, 2, 3, 4); + if (yI.y == yI.x) { + yshuf = yshuf.xxyz; + yshufn = yshufn.xxyz; + } + if (yI.z == yI.y) { + yshuf = yshuf.xyyz; + yshufn = yshufn.xyyz; + } + if (yI.w == yI.z) { + yshuf = yshuf.xyzz; + yshufn = yshufn.xyzz; + } + + auto cshuf = highHalf(ycSrc); + auto cshufn = + SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn), + 1, 4, 3, 6); + if (cI.y == cI.x) { + cshuf = cshuf.xxzz; + cshufn = cshufn.xxzz; + } + + // After shuffling, combine the Y and UV samples back into a single vector + // for blending. Shift X fraction into position as unsigned to mask off top + // bits and get rid of low bits to avoid multiplication overflow. + auto yuvPx = combine(yshuf, cshuf); + yuvPx += ((combine(yshufn, cshufn) - yuvPx) * + bit_cast>(ycFracX >> (16 - 7))) >> + 7; + + // Cache the new samples as the current samples on the next iteration. + ycSrc = ycSrcn; + ycFracX += ycFracDX; + yI = yIn; + cI = cIn; + + // De-interleave the Y and UV results. We need to average the UV results + // to produce values for intermediate samples. Taps for UV were collected at + // offsets 0.5 and 1.5, such that if we take a quarter of the difference + // (1.5-0.5)/4, subtract it from even samples, and add it to odd samples, + // we can estimate samples 0.25, 0.75, 1.25, and 1.75. 
+ auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3); + auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) + + ((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) - + SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >> + 2); + + commit_blend_span(dest, colorSpace.convert(yPx, uvPx)); + } +} + +// This is the inner loop driver of CompositeYUV that processes an axis-aligned +// YUV span, dispatching based on appropriate format and scaling. This is also +// reused by blendYUV to accelerate some cases of texture sampling in the +// shader. +template +static void linear_row_yuv(uint32_t* dest, int span, sampler2DRect samplerY, + const vec2_scalar& srcUV, float srcDU, + sampler2DRect samplerU, sampler2DRect samplerV, + const vec2_scalar& chromaUV, float chromaDU, + int colorDepth, const YUVMatrix& colorSpace) { + // Calculate varying and constant interp data for Y plane. + I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS)); + int32_t yV = int32_t(srcUV.y); + + // Calculate varying and constant interp data for chroma planes. + I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS)); + int32_t cV = int32_t(chromaUV.y); + + // We need to skip 4 pixels per chunk. + int32_t yDU = int32_t((4 << STEP_BITS) * srcDU); + int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU); + + if (samplerY->width < 2 || samplerU->width < 2) { + // If the source row has less than 2 pixels, it's not safe to use a linear + // filter because it may overread the row. Just convert the single pixel + // with nearest filtering and fill the row with it. + Float yuvF = {texelFetch(samplerY, ivec2(srcUV)).x.x, + texelFetch(samplerU, ivec2(chromaUV)).x.x, + texelFetch(samplerV, ivec2(chromaUV)).x.x, 1.0f}; + // If this is an HDR LSB format, we need to renormalize the result. 
+ if (colorDepth > 8) { + int rescaleFactor = 16 - colorDepth; + yuvF *= float(1 << rescaleFactor); + } + I16 yuv = CONVERT(round_pixel(yuvF), I16); + commit_solid_span( + dest, + unpack(colorSpace.convert(V8(yuv.x), + zip(I16(yuv.y), I16(yuv.z)))), + span); + } else if (samplerY->format == TextureFormat::R16) { + // Sample each YUV plane, rescale it to fit in low 8 bits of word, and + // then transform them by the appropriate color space. + assert(colorDepth > 8); + // Need to right shift the sample by the amount of bits over 8 it + // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit + // of precision at the low end already, hence 1 is subtracted from the + // color depth. + int rescaleBits = (colorDepth - 1) - 8; + for (; span >= 4; span -= 4) { + auto yPx = + textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >> + rescaleBits; + auto uPx = + textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >> + rescaleBits; + auto vPx = + textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >> + rescaleBits; + commit_blend_span( + dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx))); + dest += 4; + yU += yDU; + cU += cDU; + } + if (span > 0) { + // Handle any remaining pixels... + auto yPx = + textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >> + rescaleBits; + auto uPx = + textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >> + rescaleBits; + auto vPx = + textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >> + rescaleBits; + commit_blend_span( + dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)), span); + } + } else { + assert(samplerY->format == TextureFormat::R8); + assert(colorDepth == 8); + + // Calculate varying and constant interp data for Y plane. + int16_t yFracV = yV & 0x7F; + yV >>= 7; + int32_t yOffsetV = clampCoord(yV, samplerY->height) * samplerY->stride; + int32_t yStrideV = + yV >= 0 && yV < int32_t(samplerY->height) - 1 ? 
samplerY->stride : 0; + + // Calculate varying and constant interp data for chroma planes. + int16_t cFracV = cV & 0x7F; + cV >>= 7; + int32_t cOffsetV = clampCoord(cV, samplerU->height) * samplerU->stride; + int32_t cStrideV = + cV >= 0 && cV < int32_t(samplerU->height) - 1 ? samplerU->stride : 0; + + // If we're sampling the UV planes at half the resolution of the Y plane, + // then try to use half resolution fast-path. + if (yDU >= cDU && cDU > 0 && yDU <= (4 << (STEP_BITS + 7)) && + cDU <= (2 << (STEP_BITS + 7))) { + // Ensure that samples don't fall outside of the valid bounds of each + // planar texture. Step until the initial X coordinates are positive. + for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) { + auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, + yStrideV, yFracV); + auto uvPx = textureLinearRowPairedR8( + samplerU, samplerV, cU >> STEP_BITS, cOffsetV, cStrideV, cFracV); + commit_blend_span(dest, colorSpace.convert(yPx, uvPx)); + dest += 4; + yU += yDU; + cU += cDU; + } + // Calculate the number of aligned chunks that we can step inside the + // bounds of each planar texture without overreading. + int inside = min( + min((((int(samplerY->width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU, + (((int(samplerU->width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) * + 4, + span & ~3); + if (inside > 0) { + uint8_t* yRow = (uint8_t*)samplerY->buf + yOffsetV; + uint8_t* cRow1 = (uint8_t*)samplerU->buf + cOffsetV; + uint8_t* cRow2 = (uint8_t*)samplerV->buf + cOffsetV; + upscaleYUV42R8(dest, inside, yRow, yU, yDU, yStrideV, yFracV, + cRow1, cRow2, cU, cDU, cStrideV, cFracV, + colorSpace); + span -= inside; + dest += inside; + yU += (inside / 4) * yDU; + cU += (inside / 4) * cDU; + } + // If there are any remaining chunks that weren't inside, handle them + // below. + } + for (; span >= 4; span -= 4) { + // Sample each YUV plane and then transform them by the appropriate + // color space. 
+ auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, + yStrideV, yFracV); + auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS, + cOffsetV, cStrideV, cFracV); + commit_blend_span(dest, colorSpace.convert(yPx, uvPx)); + dest += 4; + yU += yDU; + cU += cDU; + } + if (span > 0) { + // Handle any remaining pixels... + auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, + yStrideV, yFracV); + auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS, + cOffsetV, cStrideV, cFracV); + commit_blend_span(dest, colorSpace.convert(yPx, uvPx), span); + } + } +} + +static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex, + const YUVMatrix& rgbFromYcbcr, int colorDepth, + const IntRect& srcReq, Texture& dsttex, + const IntRect& dstReq, bool invertX, + bool invertY, const IntRect& clipRect) { + // Compute valid dest bounds + IntRect dstBounds = dsttex.sample_bounds(dstReq); + dstBounds.intersect(clipRect); + // Check if sampling bounds are empty + if (dstBounds.is_empty()) { + return; + } + // Initialize samplers for source textures + sampler2DRect_impl sampler[3]; + init_sampler(&sampler[0], ytex); + init_sampler(&sampler[1], utex); + init_sampler(&sampler[2], vtex); + + // Compute source UVs + vec2_scalar srcUV(srcReq.x0, srcReq.y0); + vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), + float(srcReq.height()) / dstReq.height()); + if (invertX) { + // Advance to the end of the row and flip the step. 
+ srcUV.x += srcReq.width(); + srcDUV.x = -srcDUV.x; + } + // Inverted Y must step downward along source rows + if (invertY) { + srcUV.y += srcReq.height(); + srcDUV.y = -srcDUV.y; + } + // Skip to clamped source start + srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f); + // Calculate separate chroma UVs for chroma planes with different scale + vec2_scalar chromaScale(float(utex.width) / ytex.width, + float(utex.height) / ytex.height); + vec2_scalar chromaUV = srcUV * chromaScale; + vec2_scalar chromaDUV = srcDUV * chromaScale; + // Scale UVs by lerp precision. If the row has only 1 pixel, then don't + // quantize so that we can use nearest filtering instead to avoid overreads. + if (ytex.width >= 2 && utex.width >= 2) { + srcUV = linearQuantize(srcUV, 128); + srcDUV *= 128.0f; + chromaUV = linearQuantize(chromaUV, 128); + chromaDUV *= 128.0f; + } + // Calculate dest pointer from clamped offsets + int destStride = dsttex.stride(); + char* dest = dsttex.sample_ptr(dstReq, dstBounds); + int span = dstBounds.width(); + for (int rows = dstBounds.height(); rows > 0; rows--) { + linear_row_yuv((uint32_t*)dest, span, &sampler[0], srcUV, srcDUV.x, + &sampler[1], &sampler[2], chromaUV, chromaDUV.x, colorDepth, + rgbFromYcbcr); + dest += destStride; + srcUV.y += srcDUV.y; + chromaUV.y += chromaDUV.y; + } +} + +// - +// This section must match gfx/2d/Types.h + +enum class YUVRangedColorSpace : uint8_t { + BT601_Narrow = 0, + BT601_Full, + BT709_Narrow, + BT709_Full, + BT2020_Narrow, + BT2020_Full, + GbrIdentity, +}; + +// - +// This section must match yuv.glsl + +vec4_scalar get_ycbcr_zeros_ones(const YUVRangedColorSpace color_space, + const GLuint color_depth) { + // For SWGL's 8bpc-only pipeline, our extra care here probably doesn't matter. + // However, technically e.g. 10-bit achromatic zero for cb and cr is + // (128 << 2) / ((1 << 10) - 1) = 512 / 1023, which != 128 / 255, and affects + // our matrix values subtly. Maybe not enough to matter? 
But it's the most + // correct thing to do. + // Unlike the glsl version, our texture samples are u8([0,255]) not + // u16([0,1023]) though. + switch (color_space) { + case YUVRangedColorSpace::BT601_Narrow: + case YUVRangedColorSpace::BT709_Narrow: + case YUVRangedColorSpace::BT2020_Narrow: { + auto extra_bit_count = color_depth - 8; + vec4_scalar zo = { + float(16 << extra_bit_count), + float(128 << extra_bit_count), + float(235 << extra_bit_count), + float(240 << extra_bit_count), + }; + float all_bits = (1 << color_depth) - 1; + zo /= all_bits; + return zo; + } + + case YUVRangedColorSpace::BT601_Full: + case YUVRangedColorSpace::BT709_Full: + case YUVRangedColorSpace::BT2020_Full: { + const auto narrow = + get_ycbcr_zeros_ones(YUVRangedColorSpace::BT601_Narrow, color_depth); + return {0.0, narrow.y, 1.0, 1.0}; + } + + case YUVRangedColorSpace::GbrIdentity: + break; + } + return {0.0, 0.0, 1.0, 1.0}; +} + +constexpr mat3_scalar RgbFromYuv_Rec601 = { + {1.00000, 1.00000, 1.00000}, + {0.00000, -0.17207, 0.88600}, + {0.70100, -0.35707, 0.00000}, +}; +constexpr mat3_scalar RgbFromYuv_Rec709 = { + {1.00000, 1.00000, 1.00000}, + {0.00000, -0.09366, 0.92780}, + {0.78740, -0.23406, 0.00000}, +}; +constexpr mat3_scalar RgbFromYuv_Rec2020 = { + {1.00000, 1.00000, 1.00000}, + {0.00000, -0.08228, 0.94070}, + {0.73730, -0.28568, 0.00000}, +}; +constexpr mat3_scalar RgbFromYuv_GbrIdentity = { + {0, 1, 0}, + {0, 0, 1}, + {1, 0, 0}, +}; + +inline mat3_scalar get_rgb_from_yuv(const YUVRangedColorSpace color_space) { + switch (color_space) { + case YUVRangedColorSpace::BT601_Narrow: + case YUVRangedColorSpace::BT601_Full: + return RgbFromYuv_Rec601; + case YUVRangedColorSpace::BT709_Narrow: + case YUVRangedColorSpace::BT709_Full: + return RgbFromYuv_Rec709; + case YUVRangedColorSpace::BT2020_Narrow: + case YUVRangedColorSpace::BT2020_Full: + return RgbFromYuv_Rec2020; + case YUVRangedColorSpace::GbrIdentity: + break; + } + return RgbFromYuv_GbrIdentity; +} + +struct YcbcrInfo 
final { + vec3_scalar ycbcr_bias; + mat3_scalar rgb_from_debiased_ycbcr; +}; + +inline YcbcrInfo get_ycbcr_info(const YUVRangedColorSpace color_space, + GLuint color_depth) { + // SWGL always does 8bpc math, so don't scale the matrix for 10bpc! + color_depth = 8; + + const auto zeros_ones = get_ycbcr_zeros_ones(color_space, color_depth); + const auto zeros = vec2_scalar{zeros_ones.x, zeros_ones.y}; + const auto ones = vec2_scalar{zeros_ones.z, zeros_ones.w}; + const auto scale = 1.0f / (ones - zeros); + + const auto rgb_from_yuv = get_rgb_from_yuv(color_space); + const mat3_scalar yuv_from_debiased_ycbcr = { + {scale.x, 0, 0}, + {0, scale.y, 0}, + {0, 0, scale.y}, + }; + + YcbcrInfo ret; + ret.ycbcr_bias = {zeros.x, zeros.y, zeros.y}; + ret.rgb_from_debiased_ycbcr = rgb_from_yuv * yuv_from_debiased_ycbcr; + return ret; +} + +// - + +extern "C" { + +// Extension for compositing a YUV surface represented by separate YUV planes +// to a BGRA destination. The supplied color space is used to determine the +// transform from YUV to BGRA after sampling. +void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY, + LockedTexture* lockedU, LockedTexture* lockedV, + YUVRangedColorSpace colorSpace, GLuint colorDepth, GLint srcX, + GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, + GLint dstY, GLsizei dstWidth, GLsizei dstHeight, + GLboolean flipX, GLboolean flipY, GLint clipX, GLint clipY, + GLsizei clipWidth, GLsizei clipHeight) { + if (!lockedDst || !lockedY || !lockedU || !lockedV) { + return; + } + if (colorSpace > YUVRangedColorSpace::GbrIdentity) { + assert(false); + return; + } + const auto ycbcrInfo = get_ycbcr_info(colorSpace, colorDepth); + const auto rgbFromYcbcr = + YUVMatrix::From(ycbcrInfo.ycbcr_bias, ycbcrInfo.rgb_from_debiased_ycbcr); + + Texture& ytex = *lockedY; + Texture& utex = *lockedU; + Texture& vtex = *lockedV; + Texture& dsttex = *lockedDst; + // All YUV planes must currently be represented by R8 or R16 textures. 
+ // The chroma (U/V) planes must have matching dimensions. + assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp()); + assert((ytex.bpp() == 1 && colorDepth == 8) || + (ytex.bpp() == 2 && colorDepth > 8)); + // assert(ytex.width == utex.width && ytex.height == utex.height); + assert(utex.width == vtex.width && utex.height == vtex.height); + assert(ytex.offset == utex.offset && ytex.offset == vtex.offset); + assert(dsttex.bpp() == 4); + + IntRect srcReq = + IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset; + IntRect dstReq = + IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; + if (srcReq.is_empty() || dstReq.is_empty()) { + return; + } + + // Compute clip rect as relative to the dstReq, as that's the same coords + // as used for the sampling bounds. + IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, + clipY - dstY + clipHeight}; + // For now, always use a linear filter path that would be required for + // scaling. Further fast-paths for non-scaled video might be desirable in the + // future. + linear_convert_yuv(ytex, utex, vtex, rgbFromYcbcr, colorDepth, srcReq, dsttex, + dstReq, flipX, flipY, clipRect); +} + +} // extern "C" diff --git a/gfx/wr/swgl/src/gl.cc b/gfx/wr/swgl/src/gl.cc new file mode 100644 index 0000000000..0b09e29008 --- /dev/null +++ b/gfx/wr/swgl/src/gl.cc @@ -0,0 +1,2851 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include +#include +#include +#include +#include +#include + +#ifdef __MACH__ +# include +# include +#else +# include +#endif + +#ifdef NDEBUG +# define debugf(...) +#else +# define debugf(...) 
printf(__VA_ARGS__) +#endif + +// #define PRINT_TIMINGS + +#ifdef _WIN32 +# define ALWAYS_INLINE __forceinline +# define NO_INLINE __declspec(noinline) + +// Including Windows.h brings a huge amount of namespace pollution so just +// define a couple of things manually +typedef int BOOL; +# define WINAPI __stdcall +# define DECLSPEC_IMPORT __declspec(dllimport) +# define WINBASEAPI DECLSPEC_IMPORT +typedef unsigned long DWORD; +typedef long LONG; +typedef __int64 LONGLONG; +# define DUMMYSTRUCTNAME + +typedef union _LARGE_INTEGER { + struct { + DWORD LowPart; + LONG HighPart; + } DUMMYSTRUCTNAME; + struct { + DWORD LowPart; + LONG HighPart; + } u; + LONGLONG QuadPart; +} LARGE_INTEGER; +extern "C" { +WINBASEAPI BOOL WINAPI +QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount); + +WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); +} + +#else +// GCC is slower when dealing with always_inline, especially in debug builds. +// When using Clang, use always_inline more aggressively. +# if defined(__clang__) || defined(NDEBUG) +# define ALWAYS_INLINE __attribute__((always_inline)) inline +# else +# define ALWAYS_INLINE inline +# endif +# define NO_INLINE __attribute__((noinline)) +#endif + +// Some functions may cause excessive binary bloat if inlined in debug or with +// GCC builds, so use PREFER_INLINE on these instead of ALWAYS_INLINE. 
+#if defined(__clang__) && defined(NDEBUG) +# define PREFER_INLINE ALWAYS_INLINE +#else +# define PREFER_INLINE inline +#endif + +#define UNREACHABLE __builtin_unreachable() + +#define UNUSED [[maybe_unused]] + +#define FALLTHROUGH [[fallthrough]] + +#if defined(MOZILLA_CLIENT) && defined(MOZ_CLANG_PLUGIN) +# define IMPLICIT __attribute__((annotate("moz_implicit"))) +#else +# define IMPLICIT +#endif + +#include "gl_defs.h" +#include "glsl.h" +#include "program.h" +#include "texture.h" + +using namespace glsl; + +typedef ivec2_scalar IntPoint; + +struct IntRect { + int x0; + int y0; + int x1; + int y1; + + IntRect() : x0(0), y0(0), x1(0), y1(0) {} + IntRect(int x0, int y0, int x1, int y1) : x0(x0), y0(y0), x1(x1), y1(y1) {} + IntRect(IntPoint origin, IntPoint size) + : x0(origin.x), + y0(origin.y), + x1(origin.x + size.x), + y1(origin.y + size.y) {} + + int width() const { return x1 - x0; } + int height() const { return y1 - y0; } + bool is_empty() const { return width() <= 0 || height() <= 0; } + + IntPoint origin() const { return IntPoint(x0, y0); } + + bool same_size(const IntRect& o) const { + return width() == o.width() && height() == o.height(); + } + + bool contains(const IntRect& o) const { + return o.x0 >= x0 && o.y0 >= y0 && o.x1 <= x1 && o.y1 <= y1; + } + + IntRect& intersect(const IntRect& o) { + x0 = max(x0, o.x0); + y0 = max(y0, o.y0); + x1 = min(x1, o.x1); + y1 = min(y1, o.y1); + return *this; + } + + IntRect intersection(const IntRect& o) { + IntRect result = *this; + result.intersect(o); + return result; + } + + // Scale from source-space to dest-space, optionally rounding inward + IntRect& scale(int srcWidth, int srcHeight, int dstWidth, int dstHeight, + bool roundIn = false) { + x0 = (x0 * dstWidth + (roundIn ? srcWidth - 1 : 0)) / srcWidth; + y0 = (y0 * dstHeight + (roundIn ? 
srcHeight - 1 : 0)) / srcHeight; + x1 = (x1 * dstWidth) / srcWidth; + y1 = (y1 * dstHeight) / srcHeight; + return *this; + } + + // Flip the rect's Y coords around inflection point at Y=offset + void invert_y(int offset) { + y0 = offset - y0; + y1 = offset - y1; + swap(y0, y1); + } + + IntRect& offset(const IntPoint& o) { + x0 += o.x; + y0 += o.y; + x1 += o.x; + y1 += o.y; + return *this; + } + + IntRect operator+(const IntPoint& o) const { + return IntRect(*this).offset(o); + } + IntRect operator-(const IntPoint& o) const { + return IntRect(*this).offset(-o); + } +}; + +typedef vec2_scalar Point2D; +typedef vec4_scalar Point3D; + +struct IntRange { + int start; + int end; + + int len() const { return end - start; } + + IntRange intersect(IntRange r) const { + return {max(start, r.start), min(end, r.end)}; + } +}; + +struct FloatRange { + float start; + float end; + + float clip(float x) const { return clamp(x, start, end); } + + FloatRange clip(FloatRange r) const { return {clip(r.start), clip(r.end)}; } + + FloatRange merge(FloatRange r) const { + return {min(start, r.start), max(end, r.end)}; + } + + IntRange round() const { + return {int(floor(start + 0.5f)), int(floor(end + 0.5f))}; + } + + IntRange round_out() const { return {int(floor(start)), int(ceil(end))}; } +}; + +template +static inline FloatRange x_range(P p0, P p1) { + return {min(p0.x, p1.x), max(p0.x, p1.x)}; +} + +struct VertexAttrib { + size_t size = 0; // in bytes + GLenum type = 0; + bool normalized = false; + GLsizei stride = 0; + GLuint offset = 0; + bool enabled = false; + GLuint divisor = 0; + int vertex_array = 0; + int vertex_buffer = 0; + char* buf = nullptr; // XXX: this can easily dangle + size_t buf_size = 0; // this will let us bounds check + + // Mark the buffer as invalid so we don't accidentally use stale data. 
+ void disable() { + enabled = false; + buf = nullptr; + buf_size = 0; + } +}; + +static int bytes_for_internal_format(GLenum internal_format) { + switch (internal_format) { + case GL_RGBA32F: + return 4 * 4; + case GL_RGBA32I: + return 4 * 4; + case GL_RGBA8: + case GL_BGRA8: + case GL_RGBA: + return 4; + case GL_R8: + case GL_RED: + return 1; + case GL_RG8: + case GL_RG: + return 2; + case GL_DEPTH_COMPONENT: + case GL_DEPTH_COMPONENT16: + case GL_DEPTH_COMPONENT24: + case GL_DEPTH_COMPONENT32: + return 4; + case GL_RGB_RAW_422_APPLE: + return 2; + case GL_R16: + return 2; + case GL_RG16: + return 4; + default: + debugf("internal format: %x\n", internal_format); + assert(0); + return 0; + } +} + +static inline int aligned_stride(int row_bytes) { return (row_bytes + 3) & ~3; } + +static TextureFormat gl_format_to_texture_format(int type) { + switch (type) { + case GL_RGBA32F: + return TextureFormat::RGBA32F; + case GL_RGBA32I: + return TextureFormat::RGBA32I; + case GL_RGBA8: + return TextureFormat::RGBA8; + case GL_R8: + return TextureFormat::R8; + case GL_RG8: + return TextureFormat::RG8; + case GL_R16: + return TextureFormat::R16; + case GL_RG16: + return TextureFormat::RG16; + case GL_RGB_RAW_422_APPLE: + return TextureFormat::YUV422; + default: + assert(0); + return TextureFormat::RGBA8; + } +} + +struct Query { + uint64_t value = 0; +}; + +struct Buffer { + char* buf = nullptr; + size_t size = 0; + size_t capacity = 0; + + // Returns true if re-allocation succeeded, false otherwise... + bool allocate(size_t new_size) { + // If the size remains unchanged, don't allocate anything. + if (new_size == size) { + return true; + } + // If the new size is within the existing capacity of the buffer, just + // reuse the existing buffer. + if (new_size <= capacity) { + size = new_size; + return true; + } + // Otherwise we need to reallocate the buffer to hold up to the requested + // larger size. 
+ char* new_buf = (char*)realloc(buf, new_size); + assert(new_buf); + if (!new_buf) { + // If we fail, null out the buffer rather than leave around the old + // allocation state. + cleanup(); + return false; + } + // The reallocation succeeded, so install the buffer. + buf = new_buf; + size = new_size; + capacity = new_size; + return true; + } + + void cleanup() { + if (buf) { + free(buf); + buf = nullptr; + size = 0; + capacity = 0; + } + } + + ~Buffer() { cleanup(); } +}; + +struct Framebuffer { + GLuint color_attachment = 0; + GLuint depth_attachment = 0; +}; + +struct Renderbuffer { + GLuint texture = 0; + + void on_erase(); +}; + +TextureFilter gl_filter_to_texture_filter(int type) { + switch (type) { + case GL_NEAREST: + return TextureFilter::NEAREST; + case GL_NEAREST_MIPMAP_LINEAR: + return TextureFilter::NEAREST; + case GL_NEAREST_MIPMAP_NEAREST: + return TextureFilter::NEAREST; + case GL_LINEAR: + return TextureFilter::LINEAR; + case GL_LINEAR_MIPMAP_LINEAR: + return TextureFilter::LINEAR; + case GL_LINEAR_MIPMAP_NEAREST: + return TextureFilter::LINEAR; + default: + assert(0); + return TextureFilter::NEAREST; + } +} + +struct Texture { + GLenum internal_format = 0; + int width = 0; + int height = 0; + char* buf = nullptr; + size_t buf_size = 0; + uint32_t buf_stride = 0; + uint8_t buf_bpp = 0; + GLenum min_filter = GL_NEAREST; + GLenum mag_filter = GL_LINEAR; + // The number of active locks on this texture. If this texture has any active + // locks, we need to disallow modifying or destroying the texture as it may + // be accessed by other threads where modifications could lead to races. + int32_t locked = 0; + // When used as an attachment of a framebuffer, rendering to the texture + // behaves as if it is located at the given offset such that the offset is + // subtracted from all transformed vertexes after the viewport is applied. 
+ IntPoint offset; + + enum FLAGS { + // If the buffer is internally-allocated by SWGL + SHOULD_FREE = 1 << 1, + // If the buffer has been cleared to initialize it. Currently this is only + // utilized by depth buffers which need to know when depth runs have reset + // to a valid row state. When unset, the depth runs may contain garbage. + CLEARED = 1 << 2, + }; + int flags = SHOULD_FREE; + bool should_free() const { return bool(flags & SHOULD_FREE); } + bool cleared() const { return bool(flags & CLEARED); } + + void set_flag(int flag, bool val) { + if (val) { + flags |= flag; + } else { + flags &= ~flag; + } + } + void set_should_free(bool val) { + // buf must be null before SHOULD_FREE can be safely toggled. Otherwise, we + // might accidentally mistakenly realloc an externally allocated buffer as + // if it were an internally allocated one. + assert(!buf); + set_flag(SHOULD_FREE, val); + } + void set_cleared(bool val) { set_flag(CLEARED, val); } + + // Delayed-clearing state. When a clear of an FB is requested, we don't + // immediately clear each row, as the rows may be subsequently overwritten + // by draw calls, allowing us to skip the work of clearing the affected rows + // either fully or partially. Instead, we keep a bit vector of rows that need + // to be cleared later and save the value they need to be cleared with so + // that we can clear these rows individually when they are touched by draws. + // This currently only works for 2D textures, but not on texture arrays. 
+ int delay_clear = 0; + uint32_t clear_val = 0; + uint32_t* cleared_rows = nullptr; + + void init_depth_runs(uint32_t z); + void fill_depth_runs(uint32_t z, const IntRect& scissor); + + void enable_delayed_clear(uint32_t val) { + delay_clear = height; + clear_val = val; + if (!cleared_rows) { + cleared_rows = new uint32_t[(height + 31) / 32]; + } + memset(cleared_rows, 0, ((height + 31) / 32) * sizeof(uint32_t)); + if (height & 31) { + cleared_rows[height / 32] = ~0U << (height & 31); + } + } + + void disable_delayed_clear() { + if (cleared_rows) { + delete[] cleared_rows; + cleared_rows = nullptr; + delay_clear = 0; + } + } + + int bpp() const { return buf_bpp; } + int compute_bpp() const { return bytes_for_internal_format(internal_format); } + + size_t stride() const { return buf_stride; } + size_t compute_stride(int bpp, int width) const { + return aligned_stride(bpp * width); + } + + // Set an external backing buffer of this texture. + void set_buffer(void* new_buf, size_t new_stride) { + assert(!should_free()); + // Ensure that the supplied stride is at least as big as the row data and + // is aligned to the smaller of either the BPP or word-size. We need to at + // least be able to sample data from within a row and sample whole pixels + // of smaller formats without risking unaligned access. + int new_bpp = compute_bpp(); + assert(new_stride >= size_t(new_bpp * width) && + new_stride % min(new_bpp, sizeof(uint32_t)) == 0); + + buf = (char*)new_buf; + buf_size = 0; + buf_bpp = new_bpp; + buf_stride = new_stride; + } + + // Returns true if re-allocation succeeded, false otherwise... + bool allocate(bool force = false, int min_width = 0, int min_height = 0) { + assert(!locked); // Locked textures shouldn't be reallocated + // If we get here, some GL API call that invalidates the texture was used. + // Mark the buffer as not-cleared to signal this. 
+ set_cleared(false); + // Check if there is either no buffer currently or if we forced validation + // of the buffer size because some dimension might have changed. + if ((!buf || force) && should_free()) { + // Compute the buffer's BPP and stride, since they may have changed. + int new_bpp = compute_bpp(); + size_t new_stride = compute_stride(new_bpp, width); + // Compute new size based on the maximum potential stride, rather than + // the current stride, to hopefully avoid reallocations when size would + // otherwise change too much... + size_t max_stride = compute_stride(new_bpp, max(width, min_width)); + size_t size = max_stride * max(height, min_height); + if ((!buf && size > 0) || size > buf_size) { + // Allocate with a SIMD register-sized tail of padding at the end so we + // can safely read or write past the end of the texture with SIMD ops. + // Currently only the flat Z-buffer texture needs this padding due to + // full-register loads and stores in check_depth and discard_depth. In + // case some code in the future accidentally uses a linear filter on a + // texture with less than 2 pixels per row, we also add this padding + // just to be safe. All other texture types and use-cases should be + // safe to omit padding. + size_t padding = + internal_format == GL_DEPTH_COMPONENT24 || max(width, min_width) < 2 + ? sizeof(Float) + : 0; + char* new_buf = (char*)realloc(buf, size + padding); + assert(new_buf); + if (!new_buf) { + // Allocation failed, so ensure we don't leave stale buffer state. + cleanup(); + return false; + } + // Successfully reallocated the buffer, so go ahead and set it. + buf = new_buf; + buf_size = size; + } + // Set the BPP and stride in case they changed. + buf_bpp = new_bpp; + buf_stride = new_stride; + } + // Allocation succeeded or nothing changed... 
+ return true; + } + + void cleanup() { + assert(!locked); // Locked textures shouldn't be destroyed + if (buf) { + // If we need to toggle SHOULD_FREE state, ensure that buf is nulled out, + // regardless of whether we internally allocated it. This will prevent us + // from wrongly treating buf as having been internally allocated for when + // we go to realloc if it actually was externally allocated. + if (should_free()) { + free(buf); + } + buf = nullptr; + buf_size = 0; + buf_bpp = 0; + buf_stride = 0; + } + disable_delayed_clear(); + } + + ~Texture() { cleanup(); } + + IntRect bounds() const { return IntRect{0, 0, width, height}; } + IntRect offset_bounds() const { return bounds() + offset; } + + // Find the valid sampling bounds relative to the requested region + IntRect sample_bounds(const IntRect& req, bool invertY = false) const { + IntRect bb = bounds().intersect(req) - req.origin(); + if (invertY) bb.invert_y(req.height()); + return bb; + } + + // Get a pointer for sampling at the given offset + char* sample_ptr(int x, int y) const { + return buf + y * stride() + x * bpp(); + } + + // Get a pointer for sampling the requested region and limit to the provided + // sampling bounds + char* sample_ptr(const IntRect& req, const IntRect& bounds, + bool invertY = false) const { + // Offset the sample pointer by the clamped bounds + int x = req.x0 + bounds.x0; + // Invert the Y offset if necessary + int y = invertY ? req.y1 - 1 - bounds.y0 : req.y0 + bounds.y0; + return sample_ptr(x, y); + } +}; + +// The last vertex attribute is reserved as a null attribute in case a vertex +// attribute is used without being set. +#define MAX_ATTRIBS 17 +#define NULL_ATTRIB 16 +struct VertexArray { + VertexAttrib attribs[MAX_ATTRIBS]; + int max_attrib = -1; + // The GL spec defines element array buffer binding to be part of VAO state. 
+ GLuint element_array_buffer_binding = 0; + + void validate(); +}; + +struct Shader { + GLenum type = 0; + ProgramLoader loader = nullptr; +}; + +struct Program { + ProgramImpl* impl = nullptr; + VertexShaderImpl* vert_impl = nullptr; + FragmentShaderImpl* frag_impl = nullptr; + bool deleted = false; + + ~Program() { delete impl; } +}; + +// clang-format off +// Fully-expand GL defines while ignoring more than 4 suffixes +#define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w +// Generate a blend key enum symbol +#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0) +#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0) +#define AA_BLEND_KEY(...) CONCAT_KEY(AA_BLEND_, __VA_ARGS__, 0, 0, 0) +#define AA_MASK_BLEND_KEY(...) CONCAT_KEY(AA_MASK_BLEND_, __VA_ARGS__, 0, 0, 0) + +// Utility macro to easily generate similar code for all implemented blend modes +#define FOR_EACH_BLEND_KEY(macro) \ + macro(GL_ONE, GL_ZERO, 0, 0) \ + macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ + macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ + macro(GL_ZERO, GL_SRC_COLOR, 0, 0) \ + macro(GL_ONE, GL_ONE, 0, 0) \ + macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ + macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ + macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ + macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) \ + macro(GL_MIN, 0, 0, 0) \ + macro(GL_MAX, 0, 0, 0) \ + macro(GL_MULTIPLY_KHR, 0, 0, 0) \ + macro(GL_SCREEN_KHR, 0, 0, 0) \ + macro(GL_OVERLAY_KHR, 0, 0, 0) \ + macro(GL_DARKEN_KHR, 0, 0, 0) \ + macro(GL_LIGHTEN_KHR, 0, 0, 0) \ + macro(GL_COLORDODGE_KHR, 0, 0, 0) \ + macro(GL_COLORBURN_KHR, 0, 0, 0) \ + macro(GL_HARDLIGHT_KHR, 0, 0, 0) \ + macro(GL_SOFTLIGHT_KHR, 0, 0, 0) \ + macro(GL_DIFFERENCE_KHR, 0, 0, 0) \ + macro(GL_EXCLUSION_KHR, 0, 
0, 0) \ + macro(GL_HSL_HUE_KHR, 0, 0, 0) \ + macro(GL_HSL_SATURATION_KHR, 0, 0, 0) \ + macro(GL_HSL_COLOR_KHR, 0, 0, 0) \ + macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0) \ + macro(SWGL_BLEND_DROP_SHADOW, 0, 0, 0) \ + macro(SWGL_BLEND_SUBPIXEL_TEXT, 0, 0, 0) + +#define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__), +#define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__), +#define DEFINE_AA_BLEND_KEY(...) AA_BLEND_KEY(__VA_ARGS__), +#define DEFINE_AA_MASK_BLEND_KEY(...) AA_MASK_BLEND_KEY(__VA_ARGS__), +enum BlendKey : uint8_t { + FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY) + FOR_EACH_BLEND_KEY(DEFINE_MASK_BLEND_KEY) + FOR_EACH_BLEND_KEY(DEFINE_AA_BLEND_KEY) + FOR_EACH_BLEND_KEY(DEFINE_AA_MASK_BLEND_KEY) + BLEND_KEY_NONE = BLEND_KEY(GL_ONE, GL_ZERO), + MASK_BLEND_KEY_NONE = MASK_BLEND_KEY(GL_ONE, GL_ZERO), + AA_BLEND_KEY_NONE = AA_BLEND_KEY(GL_ONE, GL_ZERO), + AA_MASK_BLEND_KEY_NONE = AA_MASK_BLEND_KEY(GL_ONE, GL_ZERO), +}; +// clang-format on + +const size_t MAX_TEXTURE_UNITS = 16; + +template +static inline bool unlink(T& binding, T n) { + if (binding == n) { + binding = 0; + return true; + } + return false; +} + +template +struct ObjectStore { + O** objects = nullptr; + size_t size = 0; + // reserve object 0 as null + size_t first_free = 1; + O invalid; + + ~ObjectStore() { + if (objects) { + for (size_t i = 0; i < size; i++) delete objects[i]; + free(objects); + } + } + + bool grow(size_t i) { + size_t new_size = size ? 
size : 8; + while (new_size <= i) new_size += new_size / 2; + O** new_objects = (O**)realloc(objects, new_size * sizeof(O*)); + assert(new_objects); + if (!new_objects) return false; + while (size < new_size) new_objects[size++] = nullptr; + objects = new_objects; + return true; + } + + void insert(size_t i, const O& o) { + if (i >= size && !grow(i)) return; + if (!objects[i]) objects[i] = new O(o); + } + + size_t next_free() { + size_t i = first_free; + while (i < size && objects[i]) i++; + first_free = i; + return i; + } + + size_t insert(const O& o = O()) { + size_t i = next_free(); + insert(i, o); + return i; + } + + O& operator[](size_t i) { + insert(i, O()); + return i < size ? *objects[i] : invalid; + } + + O* find(size_t i) const { return i < size ? objects[i] : nullptr; } + + template + void on_erase(T*, ...) {} + template + void on_erase(T* o, decltype(&T::on_erase)) { + o->on_erase(); + } + + bool erase(size_t i) { + if (i < size && objects[i]) { + on_erase(objects[i], nullptr); + delete objects[i]; + objects[i] = nullptr; + if (i < first_free) first_free = i; + return true; + } + return false; + } + + O** begin() const { return objects; } + O** end() const { return &objects[size]; } +}; + +struct Context { + int32_t references = 1; + + ObjectStore queries; + ObjectStore buffers; + ObjectStore textures; + ObjectStore vertex_arrays; + ObjectStore framebuffers; + ObjectStore renderbuffers; + ObjectStore shaders; + ObjectStore programs; + + GLenum last_error = GL_NO_ERROR; + + IntRect viewport = {0, 0, 0, 0}; + + bool blend = false; + GLenum blendfunc_srgb = GL_ONE; + GLenum blendfunc_drgb = GL_ZERO; + GLenum blendfunc_sa = GL_ONE; + GLenum blendfunc_da = GL_ZERO; + GLenum blend_equation = GL_FUNC_ADD; + V8 blendcolor = 0; + BlendKey blend_key = BLEND_KEY_NONE; + + bool depthtest = false; + bool depthmask = true; + GLenum depthfunc = GL_LESS; + + bool scissortest = false; + IntRect scissor = {0, 0, 0, 0}; + + GLfloat clearcolor[4] = {0, 0, 0, 0}; + GLdouble 
cleardepth = 1; + + int unpack_row_length = 0; + + int shaded_rows = 0; + int shaded_pixels = 0; + + struct TextureUnit { + GLuint texture_2d_binding = 0; + GLuint texture_rectangle_binding = 0; + + void unlink(GLuint n) { + ::unlink(texture_2d_binding, n); + ::unlink(texture_rectangle_binding, n); + } + }; + TextureUnit texture_units[MAX_TEXTURE_UNITS]; + int active_texture_unit = 0; + + GLuint current_program = 0; + + GLuint current_vertex_array = 0; + bool validate_vertex_array = true; + + GLuint pixel_pack_buffer_binding = 0; + GLuint pixel_unpack_buffer_binding = 0; + GLuint array_buffer_binding = 0; + GLuint time_elapsed_query = 0; + GLuint samples_passed_query = 0; + GLuint renderbuffer_binding = 0; + GLuint draw_framebuffer_binding = 0; + GLuint read_framebuffer_binding = 0; + GLuint unknown_binding = 0; + + GLuint& get_binding(GLenum name) { + switch (name) { + case GL_PIXEL_PACK_BUFFER: + return pixel_pack_buffer_binding; + case GL_PIXEL_UNPACK_BUFFER: + return pixel_unpack_buffer_binding; + case GL_ARRAY_BUFFER: + return array_buffer_binding; + case GL_ELEMENT_ARRAY_BUFFER: + return vertex_arrays[current_vertex_array].element_array_buffer_binding; + case GL_TEXTURE_2D: + return texture_units[active_texture_unit].texture_2d_binding; + case GL_TEXTURE_RECTANGLE: + return texture_units[active_texture_unit].texture_rectangle_binding; + case GL_TIME_ELAPSED: + return time_elapsed_query; + case GL_SAMPLES_PASSED: + return samples_passed_query; + case GL_RENDERBUFFER: + return renderbuffer_binding; + case GL_DRAW_FRAMEBUFFER: + return draw_framebuffer_binding; + case GL_READ_FRAMEBUFFER: + return read_framebuffer_binding; + default: + debugf("unknown binding %x\n", name); + assert(false); + return unknown_binding; + } + } + + Texture& get_texture(sampler2D, int unit) { + return textures[texture_units[unit].texture_2d_binding]; + } + + Texture& get_texture(isampler2D, int unit) { + return textures[texture_units[unit].texture_2d_binding]; + } + + Texture& 
get_texture(sampler2DRect, int unit) { + return textures[texture_units[unit].texture_rectangle_binding]; + } + + IntRect apply_scissor(IntRect bb, + const IntPoint& origin = IntPoint(0, 0)) const { + return scissortest ? bb.intersect(scissor - origin) : bb; + } + + IntRect apply_scissor(const Texture& t) const { + return apply_scissor(t.bounds(), t.offset); + } +}; +static Context* ctx = nullptr; +static VertexShaderImpl* vertex_shader = nullptr; +static FragmentShaderImpl* fragment_shader = nullptr; +static BlendKey blend_key = BLEND_KEY_NONE; + +static void prepare_texture(Texture& t, const IntRect* skip = nullptr); + +template +static inline void init_filter(S* s, Texture& t) { + // If the width is not at least 2 pixels, then we can't safely sample the end + // of the row with a linear filter. In that case, just punt to using nearest + // filtering instead. + s->filter = t.width >= 2 ? gl_filter_to_texture_filter(t.mag_filter) + : TextureFilter::NEAREST; +} + +template +static inline void init_sampler(S* s, Texture& t) { + prepare_texture(t); + s->width = t.width; + s->height = t.height; + s->stride = t.stride(); + int bpp = t.bpp(); + if (bpp >= 4) + s->stride /= 4; + else if (bpp == 2) + s->stride /= 2; + else + assert(bpp == 1); + // Use uint32_t* for easier sampling, but need to cast to uint8_t* or + // uint16_t* for formats with bpp < 4. + s->buf = (uint32_t*)t.buf; + s->format = gl_format_to_texture_format(t.internal_format); +} + +template +static inline void null_sampler(S* s) { + // For null texture data, just make the sampler provide a 1x1 buffer that is + // transparent black. Ensure buffer holds at least a SIMD vector of zero data + // for SIMD padding of unaligned loads. 
+ static const uint32_t zeroBuf[sizeof(Float) / sizeof(uint32_t)] = {0}; + s->width = 1; + s->height = 1; + s->stride = s->width; + s->buf = (uint32_t*)zeroBuf; + s->format = TextureFormat::RGBA8; +} + +template +static inline void null_filter(S* s) { + s->filter = TextureFilter::NEAREST; +} + +template +S* lookup_sampler(S* s, int texture) { + Texture& t = ctx->get_texture(s, texture); + if (!t.buf) { + null_sampler(s); + null_filter(s); + } else { + init_sampler(s, t); + init_filter(s, t); + } + return s; +} + +template +S* lookup_isampler(S* s, int texture) { + Texture& t = ctx->get_texture(s, texture); + if (!t.buf) { + null_sampler(s); + } else { + init_sampler(s, t); + } + return s; +} + +int bytes_per_type(GLenum type) { + switch (type) { + case GL_INT: + return 4; + case GL_FLOAT: + return 4; + case GL_UNSIGNED_SHORT: + return 2; + case GL_UNSIGNED_BYTE: + return 1; + default: + assert(0); + return 0; + } +} + +template +static inline S expand_attrib(const char* buf, size_t size, bool normalized) { + typedef typename ElementType::ty elem_type; + S scalar = {0}; + const C* src = reinterpret_cast(buf); + if (normalized) { + const float scale = 1.0f / ((1 << (8 * sizeof(C))) - 1); + for (size_t i = 0; i < size / sizeof(C); i++) { + put_nth_component(scalar, i, elem_type(src[i]) * scale); + } + } else { + for (size_t i = 0; i < size / sizeof(C); i++) { + put_nth_component(scalar, i, elem_type(src[i])); + } + } + return scalar; +} + +template +static inline S load_attrib_scalar(VertexAttrib& va, const char* src) { + if (sizeof(S) <= va.size) { + return *reinterpret_cast(src); + } + if (va.type == GL_UNSIGNED_SHORT) { + return expand_attrib(src, va.size, va.normalized); + } + if (va.type == GL_UNSIGNED_BYTE) { + return expand_attrib(src, va.size, va.normalized); + } + assert(sizeof(typename ElementType::ty) == bytes_per_type(va.type)); + S scalar = {0}; + memcpy(&scalar, src, va.size); + return scalar; +} + +template +void load_attrib(T& attrib, VertexAttrib& va, 
uint32_t start, int instance, + int count) { + typedef decltype(force_scalar(attrib)) scalar_type; + // If no buffer is available, just use a zero default. + if (!va.buf_size) { + attrib = T(scalar_type{0}); + } else if (va.divisor != 0) { + char* src = (char*)va.buf + va.stride * instance + va.offset; + assert(src + va.size <= va.buf + va.buf_size); + attrib = T(load_attrib_scalar(va, src)); + } else { + // Specialized for WR's primitive vertex order/winding. + if (!count) return; + assert(count >= 2 && count <= 4); + char* src = (char*)va.buf + va.stride * start + va.offset; + switch (count) { + case 2: { + // Lines must be indexed at offsets 0, 1. + // Line vertexes fill vertex shader SIMD lanes as 0, 1, 1, 0. + scalar_type lanes[2] = { + load_attrib_scalar(va, src), + load_attrib_scalar(va, src + va.stride)}; + attrib = (T){lanes[0], lanes[1], lanes[1], lanes[0]}; + break; + } + case 3: { + // Triangles must be indexed at offsets 0, 1, 2. + // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. + scalar_type lanes[3] = { + load_attrib_scalar(va, src), + load_attrib_scalar(va, src + va.stride), + load_attrib_scalar(va, src + va.stride * 2)}; + attrib = (T){lanes[0], lanes[1], lanes[2], lanes[2]}; + break; + } + default: + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, + // 1, 3. Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so + // that the points form a convex path that can be traversed by the + // rasterizer. + attrib = (T){load_attrib_scalar(va, src), + load_attrib_scalar(va, src + va.stride), + load_attrib_scalar(va, src + va.stride * 3), + load_attrib_scalar(va, src + va.stride * 2)}; + break; + } + } +} + +template +void load_flat_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance, + int count) { + typedef decltype(force_scalar(attrib)) scalar_type; + // If no buffer is available, just use a zero default. 
+ if (!va.buf_size) { + attrib = T{0}; + return; + } + char* src = nullptr; + if (va.divisor != 0) { + src = (char*)va.buf + va.stride * instance + va.offset; + } else { + if (!count) return; + src = (char*)va.buf + va.stride * start + va.offset; + } + assert(src + va.size <= va.buf + va.buf_size); + attrib = T(load_attrib_scalar(va, src)); +} + +void setup_program(GLuint program) { + if (!program) { + vertex_shader = nullptr; + fragment_shader = nullptr; + return; + } + Program& p = ctx->programs[program]; + assert(p.impl); + assert(p.vert_impl); + assert(p.frag_impl); + vertex_shader = p.vert_impl; + fragment_shader = p.frag_impl; +} + +extern ProgramLoader load_shader(const char* name); + +extern "C" { + +void UseProgram(GLuint program) { + if (ctx->current_program && program != ctx->current_program) { + auto* p = ctx->programs.find(ctx->current_program); + if (p && p->deleted) { + ctx->programs.erase(ctx->current_program); + } + } + ctx->current_program = program; + setup_program(program); +} + +void SetViewport(GLint x, GLint y, GLsizei width, GLsizei height) { + ctx->viewport = IntRect{x, y, x + width, y + height}; +} + +void Enable(GLenum cap) { + switch (cap) { + case GL_BLEND: + ctx->blend = true; + break; + case GL_DEPTH_TEST: + ctx->depthtest = true; + break; + case GL_SCISSOR_TEST: + ctx->scissortest = true; + break; + } +} + +void Disable(GLenum cap) { + switch (cap) { + case GL_BLEND: + ctx->blend = false; + break; + case GL_DEPTH_TEST: + ctx->depthtest = false; + break; + case GL_SCISSOR_TEST: + ctx->scissortest = false; + break; + } +} + +// Report the last error generated and clear the error status. +GLenum GetError() { + GLenum error = ctx->last_error; + ctx->last_error = GL_NO_ERROR; + return error; +} + +// Sets the error status to out-of-memory to indicate that a buffer +// or texture re-allocation failed. 
+static void out_of_memory() { ctx->last_error = GL_OUT_OF_MEMORY; } + +static const char* const extensions[] = { + "GL_ARB_blend_func_extended", + "GL_ARB_clear_texture", + "GL_ARB_copy_image", + "GL_ARB_draw_instanced", + "GL_ARB_explicit_attrib_location", + "GL_ARB_instanced_arrays", + "GL_ARB_invalidate_subdata", + "GL_ARB_texture_storage", + "GL_EXT_timer_query", + "GL_KHR_blend_equation_advanced", + "GL_KHR_blend_equation_advanced_coherent", + "GL_APPLE_rgb_422", +}; + +void GetIntegerv(GLenum pname, GLint* params) { + assert(params); + switch (pname) { + case GL_MAX_TEXTURE_UNITS: + case GL_MAX_TEXTURE_IMAGE_UNITS: + params[0] = MAX_TEXTURE_UNITS; + break; + case GL_MAX_TEXTURE_SIZE: + params[0] = 1 << 15; + break; + case GL_MAX_ARRAY_TEXTURE_LAYERS: + params[0] = 0; + break; + case GL_READ_FRAMEBUFFER_BINDING: + params[0] = ctx->read_framebuffer_binding; + break; + case GL_DRAW_FRAMEBUFFER_BINDING: + params[0] = ctx->draw_framebuffer_binding; + break; + case GL_PIXEL_PACK_BUFFER_BINDING: + params[0] = ctx->pixel_pack_buffer_binding; + break; + case GL_PIXEL_UNPACK_BUFFER_BINDING: + params[0] = ctx->pixel_unpack_buffer_binding; + break; + case GL_NUM_EXTENSIONS: + params[0] = sizeof(extensions) / sizeof(extensions[0]); + break; + case GL_MAJOR_VERSION: + params[0] = 3; + break; + case GL_MINOR_VERSION: + params[0] = 2; + break; + case GL_MIN_PROGRAM_TEXEL_OFFSET: + params[0] = 0; + break; + case GL_MAX_PROGRAM_TEXEL_OFFSET: + params[0] = MAX_TEXEL_OFFSET; + break; + default: + debugf("unhandled glGetIntegerv parameter %x\n", pname); + assert(false); + } +} + +void GetBooleanv(GLenum pname, GLboolean* params) { + assert(params); + switch (pname) { + case GL_DEPTH_WRITEMASK: + params[0] = ctx->depthmask; + break; + default: + debugf("unhandled glGetBooleanv parameter %x\n", pname); + assert(false); + } +} + +const char* GetString(GLenum name) { + switch (name) { + case GL_VENDOR: + return "Mozilla Gfx"; + case GL_RENDERER: + return "Software WebRender"; + case 
GL_VERSION: + return "3.2"; + case GL_SHADING_LANGUAGE_VERSION: + return "1.50"; + default: + debugf("unhandled glGetString parameter %x\n", name); + assert(false); + return nullptr; + } +} + +const char* GetStringi(GLenum name, GLuint index) { + switch (name) { + case GL_EXTENSIONS: + if (index >= sizeof(extensions) / sizeof(extensions[0])) { + return nullptr; + } + return extensions[index]; + default: + debugf("unhandled glGetStringi parameter %x\n", name); + assert(false); + return nullptr; + } +} + +GLenum remap_blendfunc(GLenum rgb, GLenum a) { + switch (a) { + case GL_SRC_ALPHA: + if (rgb == GL_SRC_COLOR) a = GL_SRC_COLOR; + break; + case GL_ONE_MINUS_SRC_ALPHA: + if (rgb == GL_ONE_MINUS_SRC_COLOR) a = GL_ONE_MINUS_SRC_COLOR; + break; + case GL_DST_ALPHA: + if (rgb == GL_DST_COLOR) a = GL_DST_COLOR; + break; + case GL_ONE_MINUS_DST_ALPHA: + if (rgb == GL_ONE_MINUS_DST_COLOR) a = GL_ONE_MINUS_DST_COLOR; + break; + case GL_CONSTANT_ALPHA: + if (rgb == GL_CONSTANT_COLOR) a = GL_CONSTANT_COLOR; + break; + case GL_ONE_MINUS_CONSTANT_ALPHA: + if (rgb == GL_ONE_MINUS_CONSTANT_COLOR) a = GL_ONE_MINUS_CONSTANT_COLOR; + break; + case GL_SRC_COLOR: + if (rgb == GL_SRC_ALPHA) a = GL_SRC_ALPHA; + break; + case GL_ONE_MINUS_SRC_COLOR: + if (rgb == GL_ONE_MINUS_SRC_ALPHA) a = GL_ONE_MINUS_SRC_ALPHA; + break; + case GL_DST_COLOR: + if (rgb == GL_DST_ALPHA) a = GL_DST_ALPHA; + break; + case GL_ONE_MINUS_DST_COLOR: + if (rgb == GL_ONE_MINUS_DST_ALPHA) a = GL_ONE_MINUS_DST_ALPHA; + break; + case GL_CONSTANT_COLOR: + if (rgb == GL_CONSTANT_ALPHA) a = GL_CONSTANT_ALPHA; + break; + case GL_ONE_MINUS_CONSTANT_COLOR: + if (rgb == GL_ONE_MINUS_CONSTANT_ALPHA) a = GL_ONE_MINUS_CONSTANT_ALPHA; + break; + case GL_SRC1_ALPHA: + if (rgb == GL_SRC1_COLOR) a = GL_SRC1_COLOR; + break; + case GL_ONE_MINUS_SRC1_ALPHA: + if (rgb == GL_ONE_MINUS_SRC1_COLOR) a = GL_ONE_MINUS_SRC1_COLOR; + break; + case GL_SRC1_COLOR: + if (rgb == GL_SRC1_ALPHA) a = GL_SRC1_ALPHA; + break; + case 
GL_ONE_MINUS_SRC1_COLOR: + if (rgb == GL_ONE_MINUS_SRC1_ALPHA) a = GL_ONE_MINUS_SRC1_ALPHA; + break; + } + return a; +} + +// Generate a hashed blend key based on blend func and equation state. This +// allows all the blend state to be processed down to a blend key that can be +// dealt with inside a single switch statement. +static void hash_blend_key() { + GLenum srgb = ctx->blendfunc_srgb; + GLenum drgb = ctx->blendfunc_drgb; + GLenum sa = ctx->blendfunc_sa; + GLenum da = ctx->blendfunc_da; + GLenum equation = ctx->blend_equation; +#define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20)) + // Basic non-separate blend funcs used the two argument form + int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0); + // Separate alpha blend funcs use the 4 argument hash + if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da); + // Any other blend equation than the default func_add ignores the func and + // instead generates a one-argument hash based on the equation + if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0); + switch (hash) { +#define MAP_BLEND_KEY(...) 
\ + case HASH_BLEND_KEY(__VA_ARGS__): \ + ctx->blend_key = BLEND_KEY(__VA_ARGS__); \ + break; + FOR_EACH_BLEND_KEY(MAP_BLEND_KEY) + default: + debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb, + sa, da, equation); + assert(false); + break; + } +} + +void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { + ctx->blendfunc_srgb = srgb; + ctx->blendfunc_drgb = drgb; + sa = remap_blendfunc(srgb, sa); + da = remap_blendfunc(drgb, da); + ctx->blendfunc_sa = sa; + ctx->blendfunc_da = da; + + hash_blend_key(); +} + +void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { + I32 c = round_pixel((Float){b, g, r, a}); + ctx->blendcolor = CONVERT(c, U16).xyzwxyzw; +} + +void BlendEquation(GLenum mode) { + assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX || + (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR)); + if (mode != ctx->blend_equation) { + ctx->blend_equation = mode; + hash_blend_key(); + } +} + +void DepthMask(GLboolean flag) { ctx->depthmask = flag; } + +void DepthFunc(GLenum func) { + switch (func) { + case GL_LESS: + case GL_LEQUAL: + break; + default: + assert(false); + } + ctx->depthfunc = func; +} + +void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) { + ctx->scissor = IntRect{x, y, x + width, y + height}; +} + +void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { + ctx->clearcolor[0] = r; + ctx->clearcolor[1] = g; + ctx->clearcolor[2] = b; + ctx->clearcolor[3] = a; +} + +void ClearDepth(GLdouble depth) { ctx->cleardepth = depth; } + +void ActiveTexture(GLenum texture) { + assert(texture >= GL_TEXTURE0); + assert(texture < GL_TEXTURE0 + MAX_TEXTURE_UNITS); + ctx->active_texture_unit = + clamp(int(texture - GL_TEXTURE0), 0, int(MAX_TEXTURE_UNITS - 1)); +} + +void GenQueries(GLsizei n, GLuint* result) { + for (int i = 0; i < n; i++) { + Query q; + result[i] = ctx->queries.insert(q); + } +} + +void DeleteQuery(GLuint n) { + if (n && ctx->queries.erase(n)) { + 
unlink(ctx->time_elapsed_query, n); + unlink(ctx->samples_passed_query, n); + } +} + +void GenBuffers(int n, GLuint* result) { + for (int i = 0; i < n; i++) { + Buffer b; + result[i] = ctx->buffers.insert(b); + } +} + +void DeleteBuffer(GLuint n) { + if (n && ctx->buffers.erase(n)) { + unlink(ctx->pixel_pack_buffer_binding, n); + unlink(ctx->pixel_unpack_buffer_binding, n); + unlink(ctx->array_buffer_binding, n); + } +} + +void GenVertexArrays(int n, GLuint* result) { + for (int i = 0; i < n; i++) { + VertexArray v; + result[i] = ctx->vertex_arrays.insert(v); + } +} + +void DeleteVertexArray(GLuint n) { + if (n && ctx->vertex_arrays.erase(n)) { + unlink(ctx->current_vertex_array, n); + } +} + +GLuint CreateShader(GLenum type) { + Shader s; + s.type = type; + return ctx->shaders.insert(s); +} + +void ShaderSourceByName(GLuint shader, char* name) { + Shader& s = ctx->shaders[shader]; + s.loader = load_shader(name); + if (!s.loader) { + debugf("unknown shader %s\n", name); + } +} + +void AttachShader(GLuint program, GLuint shader) { + Program& p = ctx->programs[program]; + Shader& s = ctx->shaders[shader]; + if (s.type == GL_VERTEX_SHADER) { + if (!p.impl && s.loader) p.impl = s.loader(); + } else if (s.type == GL_FRAGMENT_SHADER) { + if (!p.impl && s.loader) p.impl = s.loader(); + } else { + assert(0); + } +} + +void DeleteShader(GLuint n) { + if (n) ctx->shaders.erase(n); +} + +GLuint CreateProgram() { + Program p; + return ctx->programs.insert(p); +} + +void DeleteProgram(GLuint n) { + if (!n) return; + if (ctx->current_program == n) { + if (auto* p = ctx->programs.find(n)) { + p->deleted = true; + } + } else { + ctx->programs.erase(n); + } +} + +void LinkProgram(GLuint program) { + Program& p = ctx->programs[program]; + assert(p.impl); + if (!p.impl) { + return; + } + assert(p.impl->interpolants_size() <= sizeof(Interpolants)); + if (!p.vert_impl) p.vert_impl = p.impl->get_vertex_shader(); + if (!p.frag_impl) p.frag_impl = p.impl->get_fragment_shader(); +} + 
+GLint GetLinkStatus(GLuint program) { + if (auto* p = ctx->programs.find(program)) { + return p->impl ? 1 : 0; + } + return 0; +} + +void BindAttribLocation(GLuint program, GLuint index, char* name) { + Program& p = ctx->programs[program]; + assert(p.impl); + if (!p.impl) { + return; + } + p.impl->bind_attrib(name, index); +} + +GLint GetAttribLocation(GLuint program, char* name) { + Program& p = ctx->programs[program]; + assert(p.impl); + if (!p.impl) { + return -1; + } + return p.impl->get_attrib(name); +} + +GLint GetUniformLocation(GLuint program, char* name) { + Program& p = ctx->programs[program]; + assert(p.impl); + if (!p.impl) { + return -1; + } + GLint loc = p.impl->get_uniform(name); + // debugf("location: %d\n", loc); + return loc; +} + +static uint64_t get_time_value() { +#ifdef __MACH__ + return mach_absolute_time(); +#elif defined(_WIN32) + LARGE_INTEGER time; + static bool have_frequency = false; + static LARGE_INTEGER frequency; + if (!have_frequency) { + QueryPerformanceFrequency(&frequency); + have_frequency = true; + } + QueryPerformanceCounter(&time); + return time.QuadPart * 1000000000ULL / frequency.QuadPart; +#else + return ({ + struct timespec tp; + clock_gettime(CLOCK_MONOTONIC, &tp); + tp.tv_sec * 1000000000ULL + tp.tv_nsec; + }); +#endif +} + +void BeginQuery(GLenum target, GLuint id) { + ctx->get_binding(target) = id; + Query& q = ctx->queries[id]; + switch (target) { + case GL_SAMPLES_PASSED: + q.value = 0; + break; + case GL_TIME_ELAPSED: + q.value = get_time_value(); + break; + default: + debugf("unknown query target %x for query %d\n", target, id); + assert(false); + } +} + +void EndQuery(GLenum target) { + Query& q = ctx->queries[ctx->get_binding(target)]; + switch (target) { + case GL_SAMPLES_PASSED: + break; + case GL_TIME_ELAPSED: + q.value = get_time_value() - q.value; + break; + default: + debugf("unknown query target %x\n", target); + assert(false); + } + ctx->get_binding(target) = 0; +} + +void GetQueryObjectui64v(GLuint 
id, GLenum pname, GLuint64* params) { + Query& q = ctx->queries[id]; + switch (pname) { + case GL_QUERY_RESULT: + assert(params); + params[0] = q.value; + break; + default: + assert(false); + } +} + +void BindVertexArray(GLuint vertex_array) { + if (vertex_array != ctx->current_vertex_array) { + ctx->validate_vertex_array = true; + } + ctx->current_vertex_array = vertex_array; +} + +void BindTexture(GLenum target, GLuint texture) { + ctx->get_binding(target) = texture; +} + +void BindBuffer(GLenum target, GLuint buffer) { + ctx->get_binding(target) = buffer; +} + +void BindFramebuffer(GLenum target, GLuint fb) { + if (target == GL_FRAMEBUFFER) { + ctx->read_framebuffer_binding = fb; + ctx->draw_framebuffer_binding = fb; + } else { + assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER); + ctx->get_binding(target) = fb; + } +} + +void BindRenderbuffer(GLenum target, GLuint rb) { + ctx->get_binding(target) = rb; +} + +void PixelStorei(GLenum name, GLint param) { + if (name == GL_UNPACK_ALIGNMENT) { + assert(param == 1); + } else if (name == GL_UNPACK_ROW_LENGTH) { + ctx->unpack_row_length = param; + } +} + +static GLenum remap_internal_format(GLenum format) { + switch (format) { + case GL_DEPTH_COMPONENT: + return GL_DEPTH_COMPONENT24; + case GL_RGBA: + return GL_RGBA8; + case GL_RED: + return GL_R8; + case GL_RG: + return GL_RG8; + case GL_RGB_422_APPLE: + return GL_RGB_RAW_422_APPLE; + default: + return format; + } +} + +} // extern "C" + +static bool format_requires_conversion(GLenum external_format, + GLenum internal_format) { + switch (external_format) { + case GL_RGBA: + return internal_format == GL_RGBA8; + default: + return false; + } +} + +static inline void copy_bgra8_to_rgba8(uint32_t* dest, const uint32_t* src, + int width) { + for (; width >= 4; width -= 4, dest += 4, src += 4) { + U32 p = unaligned_load(src); + U32 rb = p & 0x00FF00FF; + unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); + } + for (; width > 0; width--, 
dest++, src++) { + uint32_t p = *src; + uint32_t rb = p & 0x00FF00FF; + *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); + } +} + +static void convert_copy(GLenum external_format, GLenum internal_format, + uint8_t* dst_buf, size_t dst_stride, + const uint8_t* src_buf, size_t src_stride, + size_t width, size_t height) { + switch (external_format) { + case GL_RGBA: + if (internal_format == GL_RGBA8) { + for (; height; height--) { + copy_bgra8_to_rgba8((uint32_t*)dst_buf, (const uint32_t*)src_buf, + width); + dst_buf += dst_stride; + src_buf += src_stride; + } + return; + } + break; + default: + break; + } + size_t row_bytes = width * bytes_for_internal_format(internal_format); + for (; height; height--) { + memcpy(dst_buf, src_buf, row_bytes); + dst_buf += dst_stride; + src_buf += src_stride; + } +} + +static void set_tex_storage(Texture& t, GLenum external_format, GLsizei width, + GLsizei height, void* buf = nullptr, + GLsizei stride = 0, GLsizei min_width = 0, + GLsizei min_height = 0) { + GLenum internal_format = remap_internal_format(external_format); + bool changed = false; + if (t.width != width || t.height != height || + t.internal_format != internal_format) { + changed = true; + t.internal_format = internal_format; + t.width = width; + t.height = height; + } + // If we are changed from an internally managed buffer to an externally + // supplied one or vice versa, ensure that we clean up old buffer state. + // However, if we have to convert the data from a non-native format, then + // always treat it as internally managed since we will need to copy to an + // internally managed native format buffer. + bool should_free = buf == nullptr || format_requires_conversion( + external_format, internal_format); + if (t.should_free() != should_free) { + changed = true; + t.cleanup(); + t.set_should_free(should_free); + } + // If now an external buffer, explicitly set it... 
+ if (!should_free) { + t.set_buffer(buf, stride); + } + t.disable_delayed_clear(); + if (!t.allocate(changed, min_width, min_height)) { + out_of_memory(); + } + // If we have a buffer that needs format conversion, then do that now. + if (buf && should_free) { + convert_copy(external_format, internal_format, (uint8_t*)t.buf, t.stride(), + (const uint8_t*)buf, stride, width, height); + } +} + +extern "C" { + +void TexStorage2D(GLenum target, GLint levels, GLenum internal_format, + GLsizei width, GLsizei height) { + assert(levels == 1); + Texture& t = ctx->textures[ctx->get_binding(target)]; + set_tex_storage(t, internal_format, width, height); +} + +GLenum internal_format_for_data(GLenum format, GLenum ty) { + if (format == GL_RED && ty == GL_UNSIGNED_BYTE) { + return GL_R8; + } else if ((format == GL_RGBA || format == GL_BGRA) && + (ty == GL_UNSIGNED_BYTE || ty == GL_UNSIGNED_INT_8_8_8_8_REV)) { + return GL_RGBA8; + } else if (format == GL_RGBA && ty == GL_FLOAT) { + return GL_RGBA32F; + } else if (format == GL_RGBA_INTEGER && ty == GL_INT) { + return GL_RGBA32I; + } else if (format == GL_RG && ty == GL_UNSIGNED_BYTE) { + return GL_RG8; + } else if (format == GL_RGB_422_APPLE && + ty == GL_UNSIGNED_SHORT_8_8_REV_APPLE) { + return GL_RGB_RAW_422_APPLE; + } else if (format == GL_RED && ty == GL_UNSIGNED_SHORT) { + return GL_R16; + } else if (format == GL_RG && ty == GL_UNSIGNED_SHORT) { + return GL_RG16; + } else { + debugf("unknown internal format for format %x, type %x\n", format, ty); + assert(false); + return 0; + } +} + +static Buffer* get_pixel_pack_buffer() { + return ctx->pixel_pack_buffer_binding + ? &ctx->buffers[ctx->pixel_pack_buffer_binding] + : nullptr; +} + +static void* get_pixel_pack_buffer_data(void* data) { + if (Buffer* b = get_pixel_pack_buffer()) { + return b->buf ? b->buf + (size_t)data : nullptr; + } + return data; +} + +static Buffer* get_pixel_unpack_buffer() { + return ctx->pixel_unpack_buffer_binding + ? 
&ctx->buffers[ctx->pixel_unpack_buffer_binding] + : nullptr; +} + +static void* get_pixel_unpack_buffer_data(void* data) { + if (Buffer* b = get_pixel_unpack_buffer()) { + return b->buf ? b->buf + (size_t)data : nullptr; + } + return data; +} + +void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, + GLsizei width, GLsizei height, GLenum format, GLenum ty, + void* data) { + if (level != 0) { + assert(false); + return; + } + data = get_pixel_unpack_buffer_data(data); + if (!data) return; + Texture& t = ctx->textures[ctx->get_binding(target)]; + IntRect skip = {xoffset, yoffset, xoffset + width, yoffset + height}; + prepare_texture(t, &skip); + assert(xoffset + width <= t.width); + assert(yoffset + height <= t.height); + assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width); + GLsizei row_length = + ctx->unpack_row_length != 0 ? ctx->unpack_row_length : width; + assert(t.internal_format == internal_format_for_data(format, ty)); + int src_bpp = format_requires_conversion(format, t.internal_format) + ? 
bytes_for_internal_format(format) + : t.bpp(); + if (!src_bpp || !t.buf) return; + convert_copy(format, t.internal_format, + (uint8_t*)t.sample_ptr(xoffset, yoffset), t.stride(), + (const uint8_t*)data, row_length * src_bpp, width, height); +} + +void TexImage2D(GLenum target, GLint level, GLint internal_format, + GLsizei width, GLsizei height, GLint border, GLenum format, + GLenum ty, void* data) { + if (level != 0) { + assert(false); + return; + } + assert(border == 0); + TexStorage2D(target, 1, internal_format, width, height); + TexSubImage2D(target, 0, 0, 0, width, height, format, ty, data); +} + +void GenerateMipmap(UNUSED GLenum target) { + // TODO: support mipmaps +} + +void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { + Texture& t = ctx->textures[texid]; + switch (pname) { + case GL_TEXTURE_WRAP_S: + assert(param == GL_CLAMP_TO_EDGE); + break; + case GL_TEXTURE_WRAP_T: + assert(param == GL_CLAMP_TO_EDGE); + break; + case GL_TEXTURE_MIN_FILTER: + t.min_filter = param; + break; + case GL_TEXTURE_MAG_FILTER: + t.mag_filter = param; + break; + default: + break; + } +} + +void TexParameteri(GLenum target, GLenum pname, GLint param) { + SetTextureParameter(ctx->get_binding(target), pname, param); +} + +void GenTextures(int n, GLuint* result) { + for (int i = 0; i < n; i++) { + Texture t; + result[i] = ctx->textures.insert(t); + } +} + +void DeleteTexture(GLuint n) { + if (n && ctx->textures.erase(n)) { + for (size_t i = 0; i < MAX_TEXTURE_UNITS; i++) { + ctx->texture_units[i].unlink(n); + } + } +} + +void GenRenderbuffers(int n, GLuint* result) { + for (int i = 0; i < n; i++) { + Renderbuffer r; + result[i] = ctx->renderbuffers.insert(r); + } +} + +void Renderbuffer::on_erase() { + for (auto* fb : ctx->framebuffers) { + if (fb) { + unlink(fb->color_attachment, texture); + unlink(fb->depth_attachment, texture); + } + } + DeleteTexture(texture); +} + +void DeleteRenderbuffer(GLuint n) { + if (n && ctx->renderbuffers.erase(n)) { + 
unlink(ctx->renderbuffer_binding, n); + } +} + +void GenFramebuffers(int n, GLuint* result) { + for (int i = 0; i < n; i++) { + Framebuffer f; + result[i] = ctx->framebuffers.insert(f); + } +} + +void DeleteFramebuffer(GLuint n) { + if (n && ctx->framebuffers.erase(n)) { + unlink(ctx->read_framebuffer_binding, n); + unlink(ctx->draw_framebuffer_binding, n); + } +} + +void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width, + GLsizei height) { + // Just refer a renderbuffer to a texture to simplify things for now... + Renderbuffer& r = ctx->renderbuffers[ctx->get_binding(target)]; + if (!r.texture) { + GenTextures(1, &r.texture); + } + switch (internal_format) { + case GL_DEPTH_COMPONENT: + case GL_DEPTH_COMPONENT16: + case GL_DEPTH_COMPONENT24: + case GL_DEPTH_COMPONENT32: + // Force depth format to 24 bits... + internal_format = GL_DEPTH_COMPONENT24; + break; + } + set_tex_storage(ctx->textures[r.texture], internal_format, width, height); +} + +void VertexAttribPointer(GLuint index, GLint size, GLenum type, bool normalized, + GLsizei stride, GLuint offset) { + // debugf("cva: %d\n", ctx->current_vertex_array); + VertexArray& v = ctx->vertex_arrays[ctx->current_vertex_array]; + if (index >= NULL_ATTRIB) { + assert(0); + return; + } + VertexAttrib& va = v.attribs[index]; + va.size = size * bytes_per_type(type); + va.type = type; + va.normalized = normalized; + va.stride = stride; + va.offset = offset; + // Buffer &vertex_buf = ctx->buffers[ctx->array_buffer_binding]; + va.vertex_buffer = ctx->array_buffer_binding; + va.vertex_array = ctx->current_vertex_array; + ctx->validate_vertex_array = true; +} + +void VertexAttribIPointer(GLuint index, GLint size, GLenum type, GLsizei stride, + GLuint offset) { + // debugf("cva: %d\n", ctx->current_vertex_array); + VertexArray& v = ctx->vertex_arrays[ctx->current_vertex_array]; + if (index >= NULL_ATTRIB) { + assert(0); + return; + } + VertexAttrib& va = v.attribs[index]; + va.size = size * 
bytes_per_type(type); + va.type = type; + va.normalized = false; + va.stride = stride; + va.offset = offset; + // Buffer &vertex_buf = ctx->buffers[ctx->array_buffer_binding]; + va.vertex_buffer = ctx->array_buffer_binding; + va.vertex_array = ctx->current_vertex_array; + ctx->validate_vertex_array = true; +} + +void EnableVertexAttribArray(GLuint index) { + VertexArray& v = ctx->vertex_arrays[ctx->current_vertex_array]; + if (index >= NULL_ATTRIB) { + assert(0); + return; + } + VertexAttrib& va = v.attribs[index]; + if (!va.enabled) { + ctx->validate_vertex_array = true; + } + va.enabled = true; + v.max_attrib = max(v.max_attrib, (int)index); +} + +void DisableVertexAttribArray(GLuint index) { + VertexArray& v = ctx->vertex_arrays[ctx->current_vertex_array]; + if (index >= NULL_ATTRIB) { + assert(0); + return; + } + VertexAttrib& va = v.attribs[index]; + if (va.enabled) { + ctx->validate_vertex_array = true; + } + va.disable(); +} + +void VertexAttribDivisor(GLuint index, GLuint divisor) { + VertexArray& v = ctx->vertex_arrays[ctx->current_vertex_array]; + // Only support divisor being 0 (per-vertex) or 1 (per-instance). 
+ if (index >= NULL_ATTRIB || divisor > 1) { + assert(0); + return; + } + VertexAttrib& va = v.attribs[index]; + va.divisor = divisor; +} + +void BufferData(GLenum target, GLsizeiptr size, void* data, + UNUSED GLenum usage) { + Buffer& b = ctx->buffers[ctx->get_binding(target)]; + if (size != b.size) { + if (!b.allocate(size)) { + out_of_memory(); + } + ctx->validate_vertex_array = true; + } + if (data && b.buf && size <= b.size) { + memcpy(b.buf, data, size); + } +} + +void BufferSubData(GLenum target, GLintptr offset, GLsizeiptr size, + void* data) { + Buffer& b = ctx->buffers[ctx->get_binding(target)]; + assert(offset + size <= b.size); + if (data && b.buf && offset + size <= b.size) { + memcpy(&b.buf[offset], data, size); + } +} + +void* MapBuffer(GLenum target, UNUSED GLbitfield access) { + Buffer& b = ctx->buffers[ctx->get_binding(target)]; + return b.buf; +} + +void* MapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length, + UNUSED GLbitfield access) { + Buffer& b = ctx->buffers[ctx->get_binding(target)]; + if (b.buf && offset >= 0 && length > 0 && offset + length <= b.size) { + return b.buf + offset; + } + return nullptr; +} + +GLboolean UnmapBuffer(GLenum target) { + Buffer& b = ctx->buffers[ctx->get_binding(target)]; + return b.buf != nullptr; +} + +void Uniform1i(GLint location, GLint V0) { + // debugf("tex: %d\n", (int)ctx->textures.size); + if (vertex_shader) { + vertex_shader->set_uniform_1i(location, V0); + } +} +void Uniform4fv(GLint location, GLsizei count, const GLfloat* v) { + assert(count == 1); + if (vertex_shader) { + vertex_shader->set_uniform_4fv(location, v); + } +} +void UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, + const GLfloat* value) { + assert(count == 1); + assert(!transpose); + if (vertex_shader) { + vertex_shader->set_uniform_matrix4fv(location, value); + } +} + +void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, + GLuint texture, GLint level) { + assert(target == 
GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER); + assert(textarget == GL_TEXTURE_2D || textarget == GL_TEXTURE_RECTANGLE); + assert(level == 0); + Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; + if (attachment == GL_COLOR_ATTACHMENT0) { + fb.color_attachment = texture; + } else if (attachment == GL_DEPTH_ATTACHMENT) { + fb.depth_attachment = texture; + } else { + assert(0); + } +} + +void FramebufferRenderbuffer(GLenum target, GLenum attachment, + GLenum renderbuffertarget, GLuint renderbuffer) { + assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER); + assert(renderbuffertarget == GL_RENDERBUFFER); + Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; + Renderbuffer& rb = ctx->renderbuffers[renderbuffer]; + if (attachment == GL_COLOR_ATTACHMENT0) { + fb.color_attachment = rb.texture; + } else if (attachment == GL_DEPTH_ATTACHMENT) { + fb.depth_attachment = rb.texture; + } else { + assert(0); + } +} + +} // extern "C" + +static inline Framebuffer* get_framebuffer(GLenum target, + bool fallback = false) { + if (target == GL_FRAMEBUFFER) { + target = GL_DRAW_FRAMEBUFFER; + } + Framebuffer* fb = ctx->framebuffers.find(ctx->get_binding(target)); + if (fallback && !fb) { + // If the specified framebuffer isn't found and a fallback is requested, + // use the default framebuffer. 
+ fb = &ctx->framebuffers[0]; + } + return fb; +} + +template +static inline void fill_n(T* dst, size_t n, T val) { + for (T* end = &dst[n]; dst < end; dst++) *dst = val; +} + +#if USE_SSE2 +template <> +inline void fill_n(uint32_t* dst, size_t n, uint32_t val) { + __asm__ __volatile__("rep stosl\n" + : "+D"(dst), "+c"(n) + : "a"(val) + : "memory", "cc"); +} +#endif + +static inline uint32_t clear_chunk(uint8_t value) { + return uint32_t(value) * 0x01010101U; +} + +static inline uint32_t clear_chunk(uint16_t value) { + return uint32_t(value) | (uint32_t(value) << 16); +} + +static inline uint32_t clear_chunk(uint32_t value) { return value; } + +template +static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { + const size_t N = sizeof(uint32_t) / sizeof(T); + // fill any leading unaligned values + if (N > 1) { + size_t align = (-(intptr_t)buf & (sizeof(uint32_t) - 1)) / sizeof(T); + if (align <= len) { + fill_n(buf, align, value); + len -= align; + buf += align; + } + } + // fill as many aligned chunks as possible + fill_n((uint32_t*)buf, len / N, chunk); + // fill any remaining values + if (N > 1) { + fill_n(buf + (len & ~(N - 1)), len & (N - 1), value); + } +} + +template +static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, + int skip_end = 0) { + if (!t.buf) return; + skip_start = max(skip_start, bb.x0); + skip_end = max(skip_end, skip_start); + assert(sizeof(T) == t.bpp()); + size_t stride = t.stride(); + // When clearing multiple full-width rows, collapse them into a single large + // "row" to avoid redundant setup from clearing each row individually. Note + // that we can only safely do this if the stride is tightly packed. 
+ if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end && + (t.should_free() || stride == t.width * sizeof(T))) { + bb.x1 += (stride / sizeof(T)) * (bb.height() - 1); + bb.y1 = bb.y0 + 1; + } + T* buf = (T*)t.sample_ptr(bb.x0, bb.y0); + uint32_t chunk = clear_chunk(value); + for (int rows = bb.height(); rows > 0; rows--) { + if (bb.x0 < skip_start) { + clear_row(buf, skip_start - bb.x0, value, chunk); + } + if (skip_end < bb.x1) { + clear_row(buf + (skip_end - bb.x0), bb.x1 - skip_end, value, chunk); + } + buf += stride / sizeof(T); + } +} + +template +static inline void force_clear_row(Texture& t, int y, int skip_start = 0, + int skip_end = 0) { + assert(t.buf != nullptr); + assert(sizeof(T) == t.bpp()); + assert(skip_start <= skip_end); + T* buf = (T*)t.sample_ptr(0, y); + uint32_t chunk = clear_chunk((T)t.clear_val); + if (skip_start > 0) { + clear_row(buf, skip_start, t.clear_val, chunk); + } + if (skip_end < t.width) { + clear_row(buf + skip_end, t.width - skip_end, t.clear_val, chunk); + } +} + +template +static void force_clear(Texture& t, const IntRect* skip = nullptr) { + if (!t.delay_clear || !t.cleared_rows) { + return; + } + int y0 = 0; + int y1 = t.height; + int skip_start = 0; + int skip_end = 0; + if (skip) { + y0 = clamp(skip->y0, 0, t.height); + y1 = clamp(skip->y1, y0, t.height); + skip_start = clamp(skip->x0, 0, t.width); + skip_end = clamp(skip->x1, skip_start, t.width); + if (skip_start <= 0 && skip_end >= t.width && y0 <= 0 && y1 >= t.height) { + t.disable_delayed_clear(); + return; + } + } + int num_masks = (y1 + 31) / 32; + uint32_t* rows = t.cleared_rows; + for (int i = y0 / 32; i < num_masks; i++) { + uint32_t mask = rows[i]; + if (mask != ~0U) { + rows[i] = ~0U; + int start = i * 32; + while (mask) { + int count = __builtin_ctz(mask); + if (count > 0) { + clear_buffer(t, t.clear_val, + IntRect{0, start, t.width, start + count}, skip_start, + skip_end); + t.delay_clear -= count; + start += count; + mask >>= count; + } + 
count = __builtin_ctz(mask + 1); + start += count; + mask >>= count; + } + int count = (i + 1) * 32 - start; + if (count > 0) { + clear_buffer(t, t.clear_val, + IntRect{0, start, t.width, start + count}, skip_start, + skip_end); + t.delay_clear -= count; + } + } + } + if (t.delay_clear <= 0) t.disable_delayed_clear(); +} + +static void prepare_texture(Texture& t, const IntRect* skip) { + if (t.delay_clear) { + switch (t.internal_format) { + case GL_RGBA8: + force_clear(t, skip); + break; + case GL_R8: + force_clear(t, skip); + break; + case GL_RG8: + force_clear(t, skip); + break; + default: + assert(false); + break; + } + } +} + +// Setup a clear on a texture. This may either force an immediate clear or +// potentially punt to a delayed clear, if applicable. +template +static void request_clear(Texture& t, T value, const IntRect& scissor) { + // If the clear would require a scissor, force clear anything outside + // the scissor, and then immediately clear anything inside the scissor. + if (!scissor.contains(t.offset_bounds())) { + IntRect skip = scissor - t.offset; + force_clear(t, &skip); + clear_buffer(t, value, skip.intersection(t.bounds())); + } else { + // Do delayed clear for 2D texture without scissor. + t.enable_delayed_clear(value); + } +} + +template +static inline void request_clear(Texture& t, T value) { + // If scissoring is enabled, use the scissor rect. Otherwise, just scissor to + // the entire texture bounds. + request_clear(t, value, ctx->scissortest ? ctx->scissor : t.offset_bounds()); +} + +extern "C" { + +void InitDefaultFramebuffer(int x, int y, int width, int height, int stride, + void* buf) { + Framebuffer& fb = ctx->framebuffers[0]; + if (!fb.color_attachment) { + GenTextures(1, &fb.color_attachment); + } + // If the dimensions or buffer properties changed, we need to reallocate + // the underlying storage for the color buffer texture. 
+ Texture& colortex = ctx->textures[fb.color_attachment]; + set_tex_storage(colortex, GL_RGBA8, width, height, buf, stride); + colortex.offset = IntPoint(x, y); + if (!fb.depth_attachment) { + GenTextures(1, &fb.depth_attachment); + } + // Ensure dimensions of the depth buffer match the color buffer. + Texture& depthtex = ctx->textures[fb.depth_attachment]; + set_tex_storage(depthtex, GL_DEPTH_COMPONENT24, width, height); + depthtex.offset = IntPoint(x, y); +} + +void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, + int32_t* height, int32_t* stride) { + Framebuffer* fb = ctx->framebuffers.find(fbo); + if (!fb || !fb->color_attachment) { + return nullptr; + } + Texture& colortex = ctx->textures[fb->color_attachment]; + if (flush) { + prepare_texture(colortex); + } + assert(colortex.offset == IntPoint(0, 0)); + if (width) { + *width = colortex.width; + } + if (height) { + *height = colortex.height; + } + if (stride) { + *stride = colortex.stride(); + } + return colortex.buf ? 
colortex.sample_ptr(0, 0) : nullptr; +} + +void ResolveFramebuffer(GLuint fbo) { + Framebuffer* fb = ctx->framebuffers.find(fbo); + if (!fb || !fb->color_attachment) { + return; + } + Texture& colortex = ctx->textures[fb->color_attachment]; + prepare_texture(colortex); +} + +void SetTextureBuffer(GLuint texid, GLenum internal_format, GLsizei width, + GLsizei height, GLsizei stride, void* buf, + GLsizei min_width, GLsizei min_height) { + Texture& t = ctx->textures[texid]; + set_tex_storage(t, internal_format, width, height, buf, stride, min_width, + min_height); +} + +GLenum CheckFramebufferStatus(GLenum target) { + Framebuffer* fb = get_framebuffer(target); + if (!fb || !fb->color_attachment) { + return GL_FRAMEBUFFER_UNSUPPORTED; + } + return GL_FRAMEBUFFER_COMPLETE; +} + +void ClearTexSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, GLsizei height, + GLsizei depth, GLenum format, GLenum type, + const void* data) { + if (level != 0) { + assert(false); + return; + } + Texture& t = ctx->textures[texture]; + assert(!t.locked); + if (width <= 0 || height <= 0 || depth <= 0) { + return; + } + assert(zoffset == 0 && depth == 1); + IntRect scissor = {xoffset, yoffset, xoffset + width, yoffset + height}; + if (t.internal_format == GL_DEPTH_COMPONENT24) { + uint32_t value = 0xFFFFFF; + switch (format) { + case GL_DEPTH_COMPONENT: + switch (type) { + case GL_DOUBLE: + value = uint32_t(*(const GLdouble*)data * 0xFFFFFF); + break; + case GL_FLOAT: + value = uint32_t(*(const GLfloat*)data * 0xFFFFFF); + break; + default: + assert(false); + break; + } + break; + default: + assert(false); + break; + } + if (t.cleared() && !scissor.contains(t.offset_bounds())) { + // If we need to scissor the clear and the depth buffer was already + // initialized, then just fill runs for that scissor area. 
+ t.fill_depth_runs(value, scissor); + } else { + // Otherwise, the buffer is either uninitialized or the clear would + // encompass the entire buffer. If uninitialized, we can safely fill + // the entire buffer with any value and thus ignore any scissoring. + t.init_depth_runs(value); + } + return; + } + + uint32_t color = 0xFF000000; + switch (type) { + case GL_FLOAT: { + const GLfloat* f = (const GLfloat*)data; + Float v = {0.0f, 0.0f, 0.0f, 1.0f}; + switch (format) { + case GL_RGBA: + v.w = f[3]; // alpha + FALLTHROUGH; + case GL_RGB: + v.z = f[2]; // blue + FALLTHROUGH; + case GL_RG: + v.y = f[1]; // green + FALLTHROUGH; + case GL_RED: + v.x = f[0]; // red + break; + default: + assert(false); + break; + } + color = bit_cast(CONVERT(round_pixel(v), U8)); + break; + } + case GL_UNSIGNED_BYTE: { + const GLubyte* b = (const GLubyte*)data; + switch (format) { + case GL_RGBA: + color = (color & ~0xFF000000) | (uint32_t(b[3]) << 24); // alpha + FALLTHROUGH; + case GL_RGB: + color = (color & ~0x00FF0000) | (uint32_t(b[2]) << 16); // blue + FALLTHROUGH; + case GL_RG: + color = (color & ~0x0000FF00) | (uint32_t(b[1]) << 8); // green + FALLTHROUGH; + case GL_RED: + color = (color & ~0x000000FF) | uint32_t(b[0]); // red + break; + default: + assert(false); + break; + } + break; + } + default: + assert(false); + break; + } + + switch (t.internal_format) { + case GL_RGBA8: + // Clear color needs to swizzle to BGRA. 
+ request_clear(t, + (color & 0xFF00FF00) | + ((color << 16) & 0xFF0000) | + ((color >> 16) & 0xFF), + scissor); + break; + case GL_R8: + request_clear(t, uint8_t(color & 0xFF), scissor); + break; + case GL_RG8: + request_clear(t, uint16_t(color & 0xFFFF), scissor); + break; + default: + assert(false); + break; + } +} + +void ClearTexImage(GLuint texture, GLint level, GLenum format, GLenum type, + const void* data) { + Texture& t = ctx->textures[texture]; + IntRect scissor = t.offset_bounds(); + ClearTexSubImage(texture, level, scissor.x0, scissor.y0, 0, scissor.width(), + scissor.height(), 1, format, type, data); +} + +void Clear(GLbitfield mask) { + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); + if ((mask & GL_COLOR_BUFFER_BIT) && fb.color_attachment) { + Texture& t = ctx->textures[fb.color_attachment]; + IntRect scissor = ctx->scissortest + ? ctx->scissor.intersection(t.offset_bounds()) + : t.offset_bounds(); + ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, + scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, + ctx->clearcolor); + } + if ((mask & GL_DEPTH_BUFFER_BIT) && fb.depth_attachment) { + Texture& t = ctx->textures[fb.depth_attachment]; + IntRect scissor = ctx->scissortest + ? 
ctx->scissor.intersection(t.offset_bounds()) + : t.offset_bounds(); + ClearTexSubImage(fb.depth_attachment, 0, scissor.x0, scissor.y0, 0, + scissor.width(), scissor.height(), 1, GL_DEPTH_COMPONENT, + GL_DOUBLE, &ctx->cleardepth); + } +} + +void ClearColorRect(GLuint fbo, GLint xoffset, GLint yoffset, GLsizei width, + GLsizei height, GLfloat r, GLfloat g, GLfloat b, + GLfloat a) { + GLfloat color[] = {r, g, b, a}; + Framebuffer& fb = ctx->framebuffers[fbo]; + Texture& t = ctx->textures[fb.color_attachment]; + IntRect scissor = + IntRect{xoffset, yoffset, xoffset + width, yoffset + height}.intersection( + t.offset_bounds()); + ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, + scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, + color); +} + +void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, + const GLenum* attachments) { + Framebuffer* fb = get_framebuffer(target); + if (!fb || num_attachments <= 0 || !attachments) { + return; + } + for (GLsizei i = 0; i < num_attachments; i++) { + switch (attachments[i]) { + case GL_DEPTH_ATTACHMENT: { + Texture& t = ctx->textures[fb->depth_attachment]; + t.set_cleared(false); + break; + } + case GL_COLOR_ATTACHMENT0: { + Texture& t = ctx->textures[fb->color_attachment]; + t.disable_delayed_clear(); + break; + } + } + } +} + +void ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, + GLenum type, void* data) { + data = get_pixel_pack_buffer_data(data); + if (!data) return; + Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!fb) return; + assert(format == GL_RED || format == GL_RGBA || format == GL_RGBA_INTEGER || + format == GL_BGRA || format == GL_RG); + Texture& t = ctx->textures[fb->color_attachment]; + if (!t.buf) return; + prepare_texture(t); + // debugf("read pixels %d, %d, %d, %d from fb %d with format %x\n", x, y, + // width, height, ctx->read_framebuffer_binding, t.internal_format); + x -= t.offset.x; + y -= t.offset.y; + assert(x >= 0 && y >= 
0); + assert(x + width <= t.width); + assert(y + height <= t.height); + if (internal_format_for_data(format, type) != t.internal_format) { + debugf("mismatched format for read pixels: %x vs %x\n", t.internal_format, + internal_format_for_data(format, type)); + assert(false); + return; + } + // Only support readback conversions that are reversible + assert(!format_requires_conversion(format, t.internal_format) || + bytes_for_internal_format(format) == t.bpp()); + uint8_t* dest = (uint8_t*)data; + size_t destStride = width * t.bpp(); + if (y < 0) { + dest += -y * destStride; + height += y; + y = 0; + } + if (y + height > t.height) { + height = t.height - y; + } + if (x < 0) { + dest += -x * t.bpp(); + width += x; + x = 0; + } + if (x + width > t.width) { + width = t.width - x; + } + if (width <= 0 || height <= 0) { + return; + } + convert_copy(format, t.internal_format, dest, destStride, + (const uint8_t*)t.sample_ptr(x, y), t.stride(), width, height); +} + +void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, + GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, + GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, + GLint dstY, GLint dstZ, GLsizei srcWidth, + GLsizei srcHeight, GLsizei srcDepth) { + assert(srcLevel == 0 && dstLevel == 0); + assert(srcZ == 0 && srcDepth == 1 && dstZ == 0); + if (srcTarget == GL_RENDERBUFFER) { + Renderbuffer& rb = ctx->renderbuffers[srcName]; + srcName = rb.texture; + } + if (dstTarget == GL_RENDERBUFFER) { + Renderbuffer& rb = ctx->renderbuffers[dstName]; + dstName = rb.texture; + } + Texture& srctex = ctx->textures[srcName]; + if (!srctex.buf) return; + prepare_texture(srctex); + Texture& dsttex = ctx->textures[dstName]; + if (!dsttex.buf) return; + assert(!dsttex.locked); + IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight}; + prepare_texture(dsttex, &skip); + assert(srctex.internal_format == dsttex.internal_format); + assert(srcWidth >= 0); + assert(srcHeight >= 0); + assert(srcX + srcWidth 
<= srctex.width); + assert(srcY + srcHeight <= srctex.height); + assert(dstX + srcWidth <= dsttex.width); + assert(dstY + srcHeight <= dsttex.height); + int bpp = srctex.bpp(); + int src_stride = srctex.stride(); + int dest_stride = dsttex.stride(); + char* dest = dsttex.sample_ptr(dstX, dstY); + char* src = srctex.sample_ptr(srcX, srcY); + for (int y = 0; y < srcHeight; y++) { + memcpy(dest, src, srcWidth * bpp); + dest += dest_stride; + src += src_stride; + } +} + +void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, + GLint yoffset, GLint x, GLint y, GLsizei width, + GLsizei height) { + assert(level == 0); + Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!fb) return; + CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D, 0, x, y, 0, + ctx->get_binding(target), GL_TEXTURE_2D, 0, xoffset, yoffset, + 0, width, height, 1); +} + +} // extern "C" + +#include "blend.h" +#include "composite.h" +#include "swgl_ext.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#ifdef __clang__ +# pragma GCC diagnostic ignored "-Wunused-private-field" +#else +# pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif +#include "load_shader.h" +#pragma GCC diagnostic pop + +#include "rasterize.h" + +void VertexArray::validate() { + int last_enabled = -1; + for (int i = 0; i <= max_attrib; i++) { + VertexAttrib& attr = attribs[i]; + if (attr.enabled) { + // VertexArray &v = ctx->vertex_arrays[attr.vertex_array]; + Buffer& vertex_buf = ctx->buffers[attr.vertex_buffer]; + attr.buf = vertex_buf.buf; + attr.buf_size = vertex_buf.size; + // debugf("%d %x %d %d %d %d\n", i, attr.type, attr.size, attr.stride, + // attr.offset, attr.divisor); + last_enabled = i; + } + } + max_attrib = last_enabled; +} + 
+extern "C" { + +void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, + GLintptr offset, GLsizei instancecount) { + if (offset < 0 || count <= 0 || instancecount <= 0 || !vertex_shader || + !fragment_shader) { + return; + } + + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); + if (!fb.color_attachment) { + return; + } + Texture& colortex = ctx->textures[fb.color_attachment]; + if (!colortex.buf) { + return; + } + assert(!colortex.locked); + assert(colortex.internal_format == GL_RGBA8 || + colortex.internal_format == GL_R8); + Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0]; + if (depthtex.buf) { + assert(depthtex.internal_format == GL_DEPTH_COMPONENT24); + assert(colortex.width == depthtex.width && + colortex.height == depthtex.height); + assert(colortex.offset == depthtex.offset); + } + + // debugf("current_vertex_array %d\n", ctx->current_vertex_array); + // debugf("indices size: %d\n", indices_buf.size); + VertexArray& v = ctx->vertex_arrays[ctx->current_vertex_array]; + if (ctx->validate_vertex_array) { + ctx->validate_vertex_array = false; + v.validate(); + } + +#ifdef PRINT_TIMINGS + uint64_t start = get_time_value(); +#endif + + ctx->shaded_rows = 0; + ctx->shaded_pixels = 0; + + vertex_shader->init_batch(); + + switch (type) { + case GL_UNSIGNED_SHORT: + assert(mode == GL_TRIANGLES); + draw_elements(count, instancecount, offset, v, colortex, + depthtex); + break; + case GL_UNSIGNED_INT: + assert(mode == GL_TRIANGLES); + draw_elements(count, instancecount, offset, v, colortex, + depthtex); + break; + case GL_NONE: + // Non-standard GL extension - if element type is GL_NONE, then we don't + // use any element buffer and behave as if DrawArrays was called instead. 
+ for (GLsizei instance = 0; instance < instancecount; instance++) { + switch (mode) { + case GL_LINES: + for (GLsizei i = 0; i + 2 <= count; i += 2) { + vertex_shader->load_attribs(v.attribs, offset + i, instance, 2); + draw_quad(2, colortex, depthtex); + } + break; + case GL_TRIANGLES: + for (GLsizei i = 0; i + 3 <= count; i += 3) { + vertex_shader->load_attribs(v.attribs, offset + i, instance, 3); + draw_quad(3, colortex, depthtex); + } + break; + default: + assert(false); + break; + } + } + break; + default: + assert(false); + break; + } + + if (ctx->samples_passed_query) { + Query& q = ctx->queries[ctx->samples_passed_query]; + q.value += ctx->shaded_pixels; + } + +#ifdef PRINT_TIMINGS + uint64_t end = get_time_value(); + printf( + "%7.3fms draw(%s, %d): %d pixels in %d rows (avg %f pixels/row, " + "%fns/pixel)\n", + double(end - start) / (1000. * 1000.), + ctx->programs[ctx->current_program].impl->get_name(), instancecount, + ctx->shaded_pixels, ctx->shaded_rows, + double(ctx->shaded_pixels) / ctx->shaded_rows, + double(end - start) / max(ctx->shaded_pixels, 1)); +#endif +} + +void Finish() { +#ifdef PRINT_TIMINGS + printf("Finish\n"); +#endif +} + +void MakeCurrent(Context* c) { + if (ctx == c) { + return; + } + ctx = c; + setup_program(ctx ? 
ctx->current_program : 0); +} + +Context* CreateContext() { return new Context; } + +void ReferenceContext(Context* c) { + if (!c) { + return; + } + ++c->references; +} + +void DestroyContext(Context* c) { + if (!c) { + return; + } + assert(c->references > 0); + --c->references; + if (c->references > 0) { + return; + } + if (ctx == c) { + MakeCurrent(nullptr); + } + delete c; +} + +size_t ReportMemory(Context* ctx, size_t (*size_of_op)(const void*)) { + size_t size = 0; + if (ctx) { + for (auto& t : ctx->textures) { + if (t && t->should_free()) { + size += size_of_op(t->buf); + } + } + } + return size; +} +} // extern "C" diff --git a/gfx/wr/swgl/src/gl_defs.h b/gfx/wr/swgl/src/gl_defs.h new file mode 100644 index 0000000000..b60eaad0ec --- /dev/null +++ b/gfx/wr/swgl/src/gl_defs.h @@ -0,0 +1,220 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +typedef int8_t GLbyte; +typedef uint8_t GLubyte; +typedef int16_t GLshort; +typedef uint16_t GLushort; +typedef int32_t GLint; +typedef uint32_t GLuint; +typedef int64_t GLint64; +typedef uint64_t GLuint64; + +typedef float GLfloat; +typedef double GLdouble; + +typedef uint32_t GLenum; +typedef uint8_t GLboolean; +typedef uint32_t GLbitfield; + +typedef int32_t GLsizei; +typedef size_t GLsizeiptr; +typedef intptr_t GLintptr; + +#define GL_FALSE 0 +#define GL_TRUE 1 + +#define GL_NONE 0 + +#define GL_NO_ERROR 0 +#define GL_OUT_OF_MEMORY 0x0505 + +#define GL_RGBA32F 0x8814 +#define GL_RGBA8 0x8058 +#define GL_R8 0x8229 +#define GL_R16 0x822A +#define GL_RG16 0x822C +#define GL_RGBA32I 0x8D82 +#define GL_BGRA8 0x93A1 +#define GL_RG8 0x822B + +#define GL_BYTE 0x1400 +#define GL_UNSIGNED_BYTE 0x1401 +#define GL_SHORT 0x1402 +#define GL_UNSIGNED_SHORT 0x1403 +#define GL_INT 0x1404 +#define GL_UNSIGNED_INT 0x1405 +#define GL_FLOAT 0x1406 +#define GL_DOUBLE 0x1408 + +#define GL_RED 0x1903 +#define GL_GREEN 0x1904 +#define GL_BLUE 0x1905 +#define GL_ALPHA 0x1906 +#define GL_RGB 0x1907 +#define GL_RGBA 0x1908 +#define GL_RGBA_INTEGER 0x8D99 +#define GL_BGRA 0x80E1 +#define GL_RG 0x8227 + +#define GL_DEPTH_COMPONENT 0x1902 +#define GL_DEPTH_COMPONENT16 0x81A5 +#define GL_DEPTH_COMPONENT24 0x81A6 +#define GL_DEPTH_COMPONENT32 0x81A7 + +#define GL_ARRAY_BUFFER 0x8892 +#define GL_ELEMENT_ARRAY_BUFFER 0x8893 + +#define GL_READ_FRAMEBUFFER 0x8CA8 +#define GL_DRAW_FRAMEBUFFER 0x8CA9 +#define GL_FRAMEBUFFER 0x8D40 +#define GL_DRAW_FRAMEBUFFER_BINDING 0x8CA6 +#define GL_READ_FRAMEBUFFER_BINDING 0x8CAA +#define GL_RENDERBUFFER 0x8D41 +#define GL_COLOR_ATTACHMENT0 0x8CE0 +#define GL_DEPTH_ATTACHMENT 0x8D00 +#define GL_STENCIL_ATTACHMENT 0x8D20 +#define GL_FRAMEBUFFER_COMPLETE 0x8CD5 +#define GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT 0x8CD6 +#define GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT 0x8CD7 +#define GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER 0x8CDB +#define 
GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER 0x8CDC +#define GL_FRAMEBUFFER_UNSUPPORTED 0x8CDD +#define GL_COLOR_BUFFER_BIT 0x00004000 +#define GL_DEPTH_BUFFER_BIT 0x00000100 +#define GL_STENCIL_BUFFER_BIT 0x00000400 + +#define GL_PIXEL_PACK_BUFFER 0x88EB +#define GL_PIXEL_UNPACK_BUFFER 0x88EC +#define GL_PIXEL_PACK_BUFFER_BINDING 0x88ED +#define GL_PIXEL_UNPACK_BUFFER_BINDING 0x88EF +#define GL_UNPACK_ROW_LENGTH 0x0CF2 +#define GL_UNPACK_ALIGNMENT 0x0CF5 + +#define GL_QUERY_RESULT 0x8866 +#define GL_QUERY_RESULT_AVAILABLE 0x8867 +#define GL_TIME_ELAPSED 0x88BF +#define GL_SAMPLES_PASSED 0x8914 + +#define GL_NEAREST 0x2600 +#define GL_LINEAR 0x2601 +#define GL_NEAREST_MIPMAP_NEAREST 0x2700 +#define GL_NEAREST_MIPMAP_LINEAR 0x2702 +#define GL_LINEAR_MIPMAP_NEAREST 0x2701 +#define GL_LINEAR_MIPMAP_LINEAR 0x2703 +#define GL_TEXTURE_WRAP_S 0x2802 +#define GL_TEXTURE_WRAP_T 0x2803 +#define GL_TEXTURE_MAG_FILTER 0x2800 +#define GL_TEXTURE_MIN_FILTER 0x2801 +#define GL_CLAMP_TO_EDGE 0x812F +#define GL_TEXTURE_2D 0x0DE1 +#define GL_TEXTURE_3D 0x806F +#define GL_TEXTURE_2D_ARRAY 0x8C1A +#define GL_TEXTURE_RECTANGLE 0x84F5 +#define GL_TEXTURE0 0x84C0 +#define GL_TEXTURE1 0x84C1 +#define GL_TEXTURE2 0x84C2 +#define GL_TEXTURE3 0x84C3 +#define GL_TEXTURE4 0x84C4 +#define GL_TEXTURE5 0x84C5 +#define GL_TEXTURE6 0x84C6 +#define GL_TEXTURE7 0x84C7 +#define GL_TEXTURE8 0x84C8 +#define GL_TEXTURE9 0x84C9 +#define GL_TEXTURE10 0x84CA +#define GL_TEXTURE11 0x84CB +#define GL_TEXTURE12 0x84CC +#define GL_TEXTURE13 0x84CD +#define GL_TEXTURE14 0x84CE +#define GL_TEXTURE15 0x84CF +#define GL_MAX_TEXTURE_UNITS 0x84E2 +#define GL_MAX_TEXTURE_IMAGE_UNITS 0x8872 +#define GL_MAX_TEXTURE_SIZE 0x0D33 +#define GL_MAX_ARRAY_TEXTURE_LAYERS 0x88FF +#define GL_MIN_PROGRAM_TEXEL_OFFSET 0x8904 +#define GL_MAX_PROGRAM_TEXEL_OFFSET 0x8905 + +#define GL_VERTEX_SHADER 0x8B31 +#define GL_FRAGMENT_SHADER 0x8B30 + +#define GL_BLEND 0x0BE2 +#define GL_ZERO 0 +#define GL_ONE 1 +#define GL_SRC_COLOR 0x0300 +#define 
GL_ONE_MINUS_SRC_COLOR 0x0301 +#define GL_SRC_ALPHA 0x0302 +#define GL_ONE_MINUS_SRC_ALPHA 0x0303 +#define GL_DST_ALPHA 0x0304 +#define GL_ONE_MINUS_DST_ALPHA 0x0305 +#define GL_DST_COLOR 0x0306 +#define GL_ONE_MINUS_DST_COLOR 0x0307 +#define GL_CONSTANT_COLOR 0x8001 +#define GL_ONE_MINUS_CONSTANT_COLOR 0x8002 +#define GL_CONSTANT_ALPHA 0x8003 +#define GL_ONE_MINUS_CONSTANT_ALPHA 0x8004 +#define GL_SRC1_ALPHA 0x8589 +#define GL_SRC1_COLOR 0x88F9 +#define GL_ONE_MINUS_SRC1_COLOR 0x88FA +#define GL_ONE_MINUS_SRC1_ALPHA 0x88FB + +#define GL_FUNC_ADD 0x8006 +#define GL_MIN 0x8007 +#define GL_MAX 0x8008 + +#define GL_NEVER 0x0200 +#define GL_LESS 0x0201 +#define GL_EQUAL 0x0202 +#define GL_LEQUAL 0x0203 +#define GL_GREATER 0x0204 +#define GL_NOTEQUAL 0x0205 +#define GL_GEQUAL 0x0206 +#define GL_ALWAYS 0x0207 +#define GL_DEPTH_TEST 0x0B71 +#define GL_DEPTH_WRITEMASK 0x0B72 + +#define GL_SCISSOR_TEST 0x0C11 + +#define GL_VENDOR 0x1F00 +#define GL_RENDERER 0x1F01 +#define GL_VERSION 0x1F02 +#define GL_EXTENSIONS 0x1F03 +#define GL_NUM_EXTENSIONS 0x821D +#define GL_MINOR_VERSION 0x821C +#define GL_MAJOR_VERSION 0x821B +#define GL_SHADING_LANGUAGE_VERSION 0x8B8C + +#define GL_POINTS 0x0000 +#define GL_LINES 0x0001 +#define GL_LINE_LOOP 0x0002 +#define GL_LINE_STRIP 0x0003 +#define GL_TRIANGLES 0x0004 +#define GL_TRIANGLE_STRIP 0x0005 +#define GL_TRIANGLE_FAN 0x0006 +#define GL_QUADS 0x0007 + +#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367 + +#define GL_RGB_422_APPLE 0x8A1F +#define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA +#define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB +#define GL_RGB_RAW_422_APPLE 0x8A51 + +#define GL_MULTIPLY_KHR 0x9294 +#define GL_SCREEN_KHR 0x9295 +#define GL_OVERLAY_KHR 0x9296 +#define GL_DARKEN_KHR 0x9297 +#define GL_LIGHTEN_KHR 0x9298 +#define GL_COLORDODGE_KHR 0x9299 +#define GL_COLORBURN_KHR 0x929A +#define GL_HARDLIGHT_KHR 0x929B +#define GL_SOFTLIGHT_KHR 0x929C +#define GL_DIFFERENCE_KHR 0x929E +#define GL_EXCLUSION_KHR 0x92A0 +#define GL_HSL_HUE_KHR 0x92AD 
+#define GL_HSL_SATURATION_KHR 0x92AE +#define GL_HSL_COLOR_KHR 0x92AF +#define GL_HSL_LUMINOSITY_KHR 0x92B0 + +#define SWGL_BLEND_DROP_SHADOW 0xB001 +#define SWGL_BLEND_SUBPIXEL_TEXT 0xB002 diff --git a/gfx/wr/swgl/src/glsl.h b/gfx/wr/swgl/src/glsl.h new file mode 100644 index 0000000000..9193c72424 --- /dev/null +++ b/gfx/wr/swgl/src/glsl.h @@ -0,0 +1,3119 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#define SI ALWAYS_INLINE static + +#include "vector_type.h" + +namespace glsl { + +enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8, RG8, R16, RG16, YUV422 }; + +enum TextureFilter { NEAREST, LINEAR }; + +struct samplerCommon { + uint32_t* buf = nullptr; + uint32_t stride = 0; // in units of BPP if < 4, or dwords if BPP >= 4 + uint32_t height = 0; + uint32_t width = 0; + TextureFormat format = TextureFormat::RGBA8; +}; + +struct samplerFilter { + TextureFilter filter = TextureFilter::NEAREST; +}; + +struct sampler2D_impl : samplerCommon, samplerFilter {}; +typedef sampler2D_impl* sampler2D; + +typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8; +typedef struct sampler2DRG8_impl : sampler2D_impl{} * sampler2DRG8; +typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8; +typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F; + +struct isampler2D_impl : samplerCommon {}; +typedef isampler2D_impl* isampler2D; + +struct isampler2DRGBA32I_impl : isampler2D_impl {}; +typedef isampler2DRGBA32I_impl* isampler2DRGBA32I; + +struct sampler2DRect_impl : samplerCommon, samplerFilter {}; +typedef sampler2DRect_impl* sampler2DRect; + +#if USE_SSE2 +SI bool test_all(Bool cond) { return _mm_movemask_ps(cond) == 0xF; } +SI bool test_any(Bool cond) { return _mm_movemask_ps(cond) != 0; } +SI bool test_none(Bool cond) { return _mm_movemask_ps(cond) == 0; } +#else +SI bool 
test_all(Bool cond) { + return bit_cast(CONVERT(cond, U8)) == 0xFFFFFFFFU; +} +SI bool test_any(Bool cond) { + return bit_cast(CONVERT(cond, U8)) != 0; +} +SI bool test_none(Bool cond) { + return bit_cast(CONVERT(cond, U8)) == 0; +} +#endif +SI bool test_equal(Bool cond) { return test_none(cond != cond.x); } + +float make_float(float n) { return n; } + +float make_float(int32_t n) { return float(n); } + +float make_float(uint32_t n) { return float(n); } + +float make_float(bool n) { return float(n); } + +template +Float make_float(T v) { + return CONVERT(v, Float); +} + +int32_t make_int(uint32_t n) { return n; } + +int32_t make_int(int32_t n) { return n; } + +int32_t make_int(float n) { return int32_t(n); } + +int32_t make_int(bool n) { return int32_t(n); } + +template +I32 make_int(T v) { + return CONVERT(v, I32); +} + +uint32_t make_uint(uint32_t n) { return n; } + +uint32_t make_uint(int32_t n) { return n; } + +uint32_t make_uint(float n) { return uint32_t(n); } + +uint32_t make_uint(bool n) { return uint32_t(n); } + +template +U32 make_uint(T v) { + return CONVERT(v, U32); +} + +template +T force_scalar(T n) { + return n; +} + +float force_scalar(Float f) { return f[0]; } + +int32_t force_scalar(I32 i) { return i[0]; } + +struct vec4; +struct ivec2; + +SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) { return c ? t : e; } +SI int32_t if_then_else(bool c, int32_t t, int32_t e) { return c ? t : e; } + +SI float if_then_else(int32_t c, float t, float e) { return c ? t : e; } + +SI Float if_then_else(I32 c, float t, float e) { + return bit_cast((c & bit_cast(Float(t))) | + (~c & bit_cast(Float(e)))); +} + +SI I32 if_then_else(I32 c, int32_t t, int32_t e) { + return (c & I32(t)) | (~c & I32(e)); +} + +SI U32 if_then_else(I32 c, U32 t, U32 e) { + return bit_cast((c & bit_cast(t)) | (~c & bit_cast(e))); +} + +// Cheaper version of if_then_else that returns Float(0) if condition is false. 
+SI Float if_then(I32 c, Float t) { + return bit_cast(c & bit_cast(t)); +} + +SI Float if_then_else(I32 c, Float t, Float e) { + return bit_cast((c & bit_cast(t)) | (~c & bit_cast(e))); +} + +SI Float if_then_else(int32_t c, Float t, Float e) { return c ? t : e; } + +SI Bool if_then_else(I32 c, Bool t, Bool e) { return (c & t) | (~c & e); } + +SI Bool if_then_else(int32_t c, Bool t, Bool e) { return c ? t : e; } + +SI I16 if_then_else(I16 c, I16 t, I16 e) { return (c & t) | (~c & e); } + +template +SI void swap(T& a, T& b) { + T t(a); + a = b; + b = t; +} + +SI int32_t min(int32_t a, int32_t b) { return a < b ? a : b; } +SI int32_t max(int32_t a, int32_t b) { return a > b ? a : b; } + +SI int32_t clamp(int32_t a, int32_t minVal, int32_t maxVal) { + return min(max(a, minVal), maxVal); +} + +SI float min(float a, float b) { return a < b ? a : b; } +SI float max(float a, float b) { return a > b ? a : b; } + +SI float clamp(float a, float minVal, float maxVal) { + return min(max(a, minVal), maxVal); +} + +SI Float min(Float a, Float b) { +#if USE_SSE2 + return _mm_min_ps(a, b); +#elif USE_NEON + return vminq_f32(a, b); +#else + return if_then_else(a < b, a, b); +#endif +} + +SI Float max(Float a, Float b) { +#if USE_SSE2 + return _mm_max_ps(a, b); +#elif USE_NEON + return vmaxq_f32(a, b); +#else + return if_then_else(a > b, a, b); +#endif +} + +SI Float clamp(Float a, Float minVal, Float maxVal) { + return min(max(a, minVal), maxVal); +} + +#define sqrt __glsl_sqrt + +SI float sqrt(float x) { return sqrtf(x); } + +SI Float sqrt(Float v) { +#if USE_SSE2 + return _mm_sqrt_ps(v); +#elif USE_NEON + Float e = vrsqrteq_f32(v); + e *= vrsqrtsq_f32(v, e * e); + e *= vrsqrtsq_f32(v, e * e); + return if_then(v != Float(0.0f), v * e); +#else + return (Float){sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w)}; +#endif +} + +SI float recip(float x) { +#if USE_SSE2 + return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x))); +#else + return 1.0f / x; +#endif +} + +// Use a fast vector reciprocal 
approximation when available. This should only +// be used in cases where it is okay that the approximation is imprecise - +// essentially visually correct but numerically wrong. Otherwise just rely on +// however the compiler would implement slower division if the platform doesn't +// provide a convenient intrinsic. +SI Float recip(Float v) { +#if USE_SSE2 + return _mm_rcp_ps(v); +#elif USE_NEON + Float e = vrecpeq_f32(v); + return vrecpsq_f32(v, e) * e; +#else + return 1.0f / v; +#endif +} + +SI float inversesqrt(float x) { +#if USE_SSE2 + return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x))); +#else + return 1.0f / sqrtf(x); +#endif +} + +SI Float inversesqrt(Float v) { +#if USE_SSE2 + return _mm_rsqrt_ps(v); +#elif USE_NEON + Float e = vrsqrteq_f32(v); + return vrsqrtsq_f32(v, e * e) * e; +#else + return 1.0f / sqrt(v); +#endif +} + +SI float step(float edge, float x) { return float(x >= edge); } + +SI Float step(Float edge, Float x) { return if_then(x >= edge, Float(1)); } + +/* +enum RGBA { + R, + G, + B, + A +};*/ + +enum XYZW { + X = 0, + Y = 1, + Z = 2, + W = 3, + R = 0, + G = 1, + B = 2, + A = 3, +}; + +struct bvec4_scalar; + +struct bvec2_scalar { + bool x; + bool y; + + bvec2_scalar() : bvec2_scalar(false) {} + IMPLICIT constexpr bvec2_scalar(bool a) : x(a), y(a) {} + constexpr bvec2_scalar(bool x, bool y) : x(x), y(y) {} + + bool& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + bool sel(XYZW c1) { return select(c1); } + + bvec2_scalar sel(XYZW c1, XYZW c2) { + return bvec2_scalar(select(c1), select(c2)); + } + bvec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); +}; + +struct bvec2_scalar1 { + bool x; + + IMPLICIT constexpr bvec2_scalar1(bool a) : x(a) {} + + operator bvec2_scalar() const { return bvec2_scalar(x); } +}; + +struct bvec2 { + bvec2() : bvec2(0) {} + IMPLICIT bvec2(Bool a) : x(a), y(a) {} + bvec2(Bool x, Bool y) : x(x), y(y) {} + Bool& select(XYZW c) { + switch (c) { + case X: + 
return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + Bool sel(XYZW c1) { return select(c1); } + + bvec2 sel(XYZW c1, XYZW c2) { return bvec2(select(c1), select(c2)); } + + bvec2 operator~() { return bvec2(~x, ~y); } + + Bool x; + Bool y; +}; + +bvec2_scalar1 make_bvec2(bool n) { return bvec2_scalar1(n); } + +bvec2_scalar make_bvec2(bool x, bool y) { return bvec2_scalar{x, y}; } + +template +bvec2 make_bvec2(const N& n) { + return bvec2(n); +} + +template +bvec2 make_bvec2(const X& x, const Y& y) { + return bvec2(x, y); +} + +struct vec3_scalar; +struct vec4_scalar; + +struct vec2_scalar { + typedef struct vec2 vector_type; + typedef float element_type; + + float x; + float y; + + constexpr vec2_scalar() : vec2_scalar(0.0f) {} + IMPLICIT constexpr vec2_scalar(float a) : x(a), y(a) {} + IMPLICIT constexpr vec2_scalar(int a) : x(a), y(a) {} + constexpr vec2_scalar(float x, float y) : x(x), y(y) {} + + float& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + float& sel(XYZW c1) { return select(c1); } + vec2_scalar sel(XYZW c1, XYZW c2) { + return vec2_scalar(select(c1), select(c2)); + } + vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3); + vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); + + friend bool operator==(const vec2_scalar& l, const vec2_scalar& r) { + return l.x == r.x && l.y == r.y; + } + + friend bool operator!=(const vec2_scalar& l, const vec2_scalar& r) { + return l.x != r.x || l.y != r.y; + } + + friend vec2_scalar operator*(float a, vec2_scalar b) { + return vec2_scalar(a * b.x, a * b.y); + } + friend vec2_scalar operator*(vec2_scalar a, float b) { + return vec2_scalar(a.x * b, a.y * b); + } + friend vec2_scalar operator*(vec2_scalar a, vec2_scalar b) { + return vec2_scalar(a.x * b.x, a.y * b.y); + } + friend vec2_scalar operator/(vec2_scalar a, float b) { + return vec2_scalar(a.x / b, a.y / b); + } + friend vec2_scalar operator/(vec2_scalar a, vec2_scalar b) { + return 
vec2_scalar(a.x / b.x, a.y / b.y); + } + + friend vec2_scalar operator-(vec2_scalar a, vec2_scalar b) { + return vec2_scalar(a.x - b.x, a.y - b.y); + } + friend vec2_scalar operator-(vec2_scalar a, float b) { + return vec2_scalar(a.x - b, a.y - b); + } + friend vec2_scalar operator-(float a, vec2_scalar b) { + return vec2_scalar(a - b.x, a - b.y); + } + friend vec2_scalar operator+(vec2_scalar a, vec2_scalar b) { + return vec2_scalar(a.x + b.x, a.y + b.y); + } + friend vec2_scalar operator+(vec2_scalar a, float b) { + return vec2_scalar(a.x + b, a.y + b); + } + + vec2_scalar operator-() { return vec2_scalar(-x, -y); } + + vec2_scalar operator*=(vec2_scalar a) { + x *= a.x; + y *= a.y; + return *this; + } + + vec2_scalar operator/=(vec2_scalar a) { + x /= a.x; + y /= a.y; + return *this; + } + + vec2_scalar operator+=(vec2_scalar a) { + x += a.x; + y += a.y; + return *this; + } + + vec2_scalar operator-=(vec2_scalar a) { + x -= a.x; + y -= a.y; + return *this; + } +}; + +struct vec2_scalar_ref { + vec2_scalar_ref(float& x, float& y) : x(x), y(y) {} + float& x; + float& y; + + float& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + float& sel(XYZW c1) { return select(c1); } + + vec2_scalar_ref& operator=(const vec2_scalar& a) { + x = a.x; + y = a.y; + return *this; + } + vec2_scalar_ref& operator*=(vec2_scalar a) { + x *= a.x; + y *= a.y; + return *this; + } + operator vec2_scalar() const { return vec2_scalar{x, y}; } +}; + +struct vec2 { + typedef struct vec2 vector_type; + typedef float element_type; + + constexpr vec2() : vec2(Float(0.0f)) {} + IMPLICIT constexpr vec2(Float a) : x(a), y(a) {} + vec2(Float x, Float y) : x(x), y(y) {} + IMPLICIT constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {} + constexpr vec2(vec2_scalar s0, vec2_scalar s1, vec2_scalar s2, vec2_scalar s3) + : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}) {} + explicit vec2(ivec2 a); + Float x; + Float y; + + Float& 
select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + Float& sel(XYZW c1) { return select(c1); } + vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); } + + vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); + + vec2 operator*=(Float a) { + x *= a; + y *= a; + return *this; + } + vec2 operator*=(vec2 a) { + x *= a.x; + y *= a.y; + return *this; + } + + vec2 operator/=(Float a) { + x /= a; + y /= a; + return *this; + } + vec2 operator/=(vec2 a) { + x /= a.x; + y /= a.y; + return *this; + } + + vec2 operator+=(vec2 a) { + x += a.x; + y += a.y; + return *this; + } + vec2 operator-=(vec2 a) { + x -= a.x; + y -= a.y; + return *this; + } + vec2 operator-=(Float a) { + x -= a; + y -= a; + return *this; + } + + vec2 operator-() { return vec2(-x, -y); } + + friend I32 operator==(const vec2& l, const vec2& r) { + return l.x == r.x && l.y == r.y; + } + + friend I32 operator!=(const vec2& l, const vec2& r) { + return l.x != r.x || l.y != r.y; + } + + friend vec2 operator*(vec2 a, Float b) { return vec2(a.x * b, a.y * b); } + friend vec2 operator*(vec2 a, vec2 b) { return vec2(a.x * b.x, a.y * b.y); } + friend vec2 operator*(Float a, vec2 b) { return vec2(a * b.x, a * b.y); } + + friend vec2 operator/(vec2 a, vec2 b) { return vec2(a.x / b.x, a.y / b.y); } + friend vec2 operator/(vec2 a, Float b) { return vec2(a.x / b, a.y / b); } + + friend vec2 operator-(vec2 a, vec2 b) { return vec2(a.x - b.x, a.y - b.y); } + friend vec2 operator-(vec2 a, Float b) { return vec2(a.x - b, a.y - b); } + friend vec2 operator-(Float a, vec2 b) { return vec2(a - b.x, a - b.y); } + friend vec2 operator+(vec2 a, vec2 b) { return vec2(a.x + b.x, a.y + b.y); } + friend vec2 operator+(vec2 a, Float b) { return vec2(a.x + b, a.y + b); } + friend vec2 operator+(Float a, vec2 b) { return vec2(a + b.x, a + b.y); } +}; + +vec2_scalar force_scalar(const vec2& v) { + return vec2_scalar{force_scalar(v.x), force_scalar(v.y)}; +} + +vec2_scalar 
make_vec2(float n) { return vec2_scalar{n, n}; } + +vec2_scalar make_vec2(float x, float y) { return vec2_scalar{x, y}; } + +vec2_scalar make_vec2(int32_t x, int32_t y) { + return vec2_scalar{float(x), float(y)}; +} + +template +vec2 make_vec2(const N& n) { + return vec2(n); +} + +template +vec2 make_vec2(const X& x, const Y& y) { + return vec2(x, y); +} + +vec2 operator*(vec2_scalar a, Float b) { return vec2(a.x * b, a.y * b); } + +vec2 operator*(Float a, vec2_scalar b) { return vec2(a * b.x, a * b.y); } + +SI vec2 min(vec2 a, vec2 b) { return vec2(min(a.x, b.x), min(a.y, b.y)); } +SI vec2 min(vec2 a, Float b) { return vec2(min(a.x, b), min(a.y, b)); } + +SI vec2_scalar min(vec2_scalar a, vec2_scalar b) { + return vec2_scalar{min(a.x, b.x), min(a.y, b.y)}; +} + +SI vec2 if_then_else(I32 c, vec2 t, vec2 e) { + return vec2(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y)); +} + +SI vec2 if_then_else(int32_t c, vec2 t, vec2 e) { return c ? t : e; } + +vec2 step(vec2 edge, vec2 x) { + return vec2(step(edge.x, x.x), step(edge.y, x.y)); +} + +vec2_scalar step(vec2_scalar edge, vec2_scalar x) { + return vec2_scalar(step(edge.x, x.x), step(edge.y, x.y)); +} + +SI vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); } +SI vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); } + +SI vec2_scalar max(vec2_scalar a, vec2_scalar b) { + return vec2_scalar{max(a.x, b.x), max(a.y, b.y)}; +} +SI vec2_scalar max(vec2_scalar a, float b) { + return vec2_scalar{max(a.x, b), max(a.y, b)}; +} + +Float length(vec2 a) { return sqrt(a.x * a.x + a.y * a.y); } + +float length(vec2_scalar a) { return hypotf(a.x, a.y); } + +template +SI auto distance(A a, B b) { + return length(a - b); +} + +template +SI T normalize(T a) { + return a / length(a); +} + +SI vec2 sqrt(vec2 a) { return vec2(sqrt(a.x), sqrt(a.y)); } + +SI vec2_scalar sqrt(vec2_scalar a) { return vec2_scalar(sqrt(a.x), sqrt(a.y)); } + +SI vec2 recip(vec2 a) { return vec2(recip(a.x), recip(a.y)); } + 
+SI vec2_scalar recip(vec2_scalar a) { + return vec2_scalar(recip(a.x), recip(a.y)); +} + +SI vec2 inversesqrt(vec2 a) { return vec2(inversesqrt(a.x), inversesqrt(a.y)); } + +SI vec2_scalar inversesqrt(vec2_scalar a) { + return vec2_scalar(inversesqrt(a.x), inversesqrt(a.y)); +} + +#define abs __glsl_abs + +int32_t abs(int32_t a) { return a < 0 ? -a : a; } + +float abs(float a) { return fabsf(a); } + +Float abs(Float v) { +#if USE_NEON + return vabsq_f32(v); +#else + return bit_cast(bit_cast(v) & bit_cast(0.0f - v)); +#endif +} + +float sign(float a) { return copysignf(1.0f, a); } + +Float sign(Float v) { + return bit_cast((bit_cast(v) & 0x80000000) | + bit_cast(Float(1.0f))); +} + +Float cast(U32 v) { return CONVERT((I32)v, Float); } +Float cast(I32 v) { return CONVERT((I32)v, Float); } +I32 cast(Float v) { return CONVERT(v, I32); } + +#define floor __glsl_floor + +float floor(float a) { return floorf(a); } + +Float floor(Float v) { + Float roundtrip = cast(cast(v)); + return roundtrip - if_then(roundtrip > v, Float(1)); +} + +vec2 floor(vec2 v) { return vec2(floor(v.x), floor(v.y)); } + +vec2_scalar floor(vec2_scalar v) { + return vec2_scalar{floorf(v.x), floorf(v.y)}; +} + +#define ceil __glsl_ceil + +float ceil(float a) { return ceilf(a); } + +Float ceil(Float v) { + Float roundtrip = cast(cast(v)); + return roundtrip + if_then(roundtrip < v, Float(1)); +} + +// Round to nearest even +SI int32_t roundeven(float v, float scale) { +#if USE_SSE2 + return _mm_cvtss_si32(_mm_set_ss(v * scale)); +#else + return bit_cast(v * scale + float(0xC00000)) - 0x4B400000; +#endif +} + +SI I32 roundeven(Float v, Float scale) { +#if USE_SSE2 + return _mm_cvtps_epi32(v * scale); +#else + // Magic number implementation of round-to-nearest-even + // see http://stereopsis.com/sree/fpu2006.html + return bit_cast(v * scale + Float(0xC00000)) - 0x4B400000; +#endif +} + +// Round towards zero +SI int32_t roundzero(float v, float scale) { return int32_t(v * scale); } + +SI I32 
roundzero(Float v, Float scale) { return cast(v * scale); } + +// Round whichever direction is fastest for positive numbers +SI I32 roundfast(Float v, Float scale) { +#if USE_SSE2 + return _mm_cvtps_epi32(v * scale); +#else + return cast(v * scale + 0.5f); +#endif +} + +template +SI auto round_pixel(T v, float scale = 255.0f) { + return roundfast(v, scale); +} + +#define round __glsl_round + +float round(float a) { return roundf(a); } + +Float round(Float v) { return floor(v + 0.5f); } + +float fract(float a) { return a - floor(a); } + +Float fract(Float v) { return v - floor(v); } + +vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); } + +vec2_scalar fract(vec2_scalar v) { return vec2_scalar(fract(v.x), fract(v.y)); } + +// X derivatives can be approximated by dFdx(x) = x[1] - x[0]. +// Y derivatives are not easily available since we operate in terms of X spans +// only. To work around, assume dFdy(p.x) = dFdx(p.y), which only holds for +// uniform scaling, and thus abs(dFdx(p.x)) + abs(dFdy(p.x)) = abs(dFdx(p.x)) + +// abs(dFdx(p.y)) which mirrors abs(dFdx(p.y)) + abs(dFdy(p.y)) = abs(dFdx(p.y)) +// + abs(dFdx(p.x)). +vec2_scalar fwidth(vec2 p) { + Float d = abs(SHUFFLE(p.x, p.y, 1, 1, 5, 5) - SHUFFLE(p.x, p.y, 0, 0, 4, 4)); + return vec2_scalar(d.x + d.z); +} + +float dFdx(Float x) { return x.y - x.x; } + +vec2_scalar dFdx(vec2 p) { return vec2_scalar(dFdx(p.x), dFdx(p.y)); } + +// See +// http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. +Float approx_log2(Float x) { + // e - 127 is a fair approximation of log2(x) in its own right... + Float e = cast(bit_cast(x)) * (1.0f / (1 << 23)); + + // ... but using the mantissa to refine its error is _much_ better. 
+ Float m = bit_cast((bit_cast(x) & 0x007fffff) | 0x3f000000); + return e - 124.225514990f - 1.498030302f * m - + 1.725879990f / (0.3520887068f + m); +} + +Float approx_pow2(Float x) { + Float f = fract(x); + return bit_cast( + roundfast(1.0f * (1 << 23), x + 121.274057500f - 1.490129070f * f + + 27.728023300f / (4.84252568f - f))); +} + +#define pow __glsl_pow + +SI float pow(float x, float y) { return powf(x, y); } + +Float pow(Float x, Float y) { + return if_then_else((x == 0) | (x == 1), x, approx_pow2(approx_log2(x) * y)); +} + +#define exp __glsl_exp + +SI float exp(float x) { return expf(x); } + +Float exp(Float y) { + float l2e = 1.4426950408889634074f; + return approx_pow2(l2e * y); +} + +#define exp2 __glsl_exp2 + +SI float exp2(float x) { return exp2f(x); } + +Float exp2(Float x) { return approx_pow2(x); } + +#define log __glsl_log + +SI float log(float x) { return logf(x); } + +Float log(Float x) { return approx_log2(x) * 0.69314718f; } + +#define log2 __glsl_log2 + +SI float log2(float x) { return log2f(x); } + +Float log2(Float x) { return approx_log2(x); } + +struct ivec4; + +struct ivec2_scalar { + typedef int32_t element_type; + + int32_t x; + int32_t y; + + ivec2_scalar() : ivec2_scalar(0) {} + IMPLICIT constexpr ivec2_scalar(int32_t a) : x(a), y(a) {} + constexpr ivec2_scalar(int32_t x, int32_t y) : x(x), y(y) {} + + int32_t& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + int32_t& sel(XYZW c1) { return select(c1); } + ivec2_scalar sel(XYZW c1, XYZW c2) { + return ivec2_scalar{select(c1), select(c2)}; + } + + ivec2_scalar operator-() const { return ivec2_scalar{-x, -y}; } + + ivec2_scalar& operator+=(ivec2_scalar a) { + x += a.x; + y += a.y; + return *this; + } + ivec2_scalar& operator+=(int n) { + x += n; + y += n; + return *this; + } + + ivec2_scalar& operator>>=(int shift) { + x >>= shift; + y >>= shift; + return *this; + } + + friend ivec2_scalar operator&(ivec2_scalar a, int b) 
{ + return ivec2_scalar{a.x & b, a.y & b}; + } + + friend ivec2_scalar operator+(ivec2_scalar a, ivec2_scalar b) { + return ivec2_scalar{a.x + b.x, a.y + b.y}; + } + friend ivec2_scalar operator+(ivec2_scalar a, int b) { + return ivec2_scalar{a.x + b, a.y + b}; + } + + friend ivec2_scalar operator-(ivec2_scalar a, ivec2_scalar b) { + return ivec2_scalar{a.x - b.x, a.y - b.y}; + } + friend ivec2_scalar operator-(ivec2_scalar a, int b) { + return ivec2_scalar{a.x - b, a.y - b}; + } + + friend bool operator==(const ivec2_scalar& l, const ivec2_scalar& r) { + return l.x == r.x && l.y == r.y; + } +}; + +struct ivec2 { + typedef int32_t element_type; + + ivec2() : ivec2(I32(0)) {} + IMPLICIT ivec2(I32 a) : x(a), y(a) {} + ivec2(I32 x, I32 y) : x(x), y(y) {} + IMPLICIT ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {} + ivec2(U32 x, U32 y) : x(CONVERT(x, I32)), y(CONVERT(y, I32)) {} + IMPLICIT constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {} + constexpr ivec2(ivec2_scalar s0, ivec2_scalar s1, ivec2_scalar s2, + ivec2_scalar s3) + : x(I32{s0.x, s1.x, s2.x, s3.x}), y(I32{s0.y, s1.y, s2.y, s3.y}) {} + I32 x; + I32 y; + + I32& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + I32& sel(XYZW c1) { return select(c1); } + + ivec2 sel(XYZW c1, XYZW c2) { return ivec2(select(c1), select(c2)); } + + ivec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); + + ivec2& operator*=(I32 a) { + x *= a; + y *= a; + return *this; + } + ivec2& operator+=(ivec2 a) { + x += a.x; + y += a.y; + return *this; + } + ivec2& operator>>=(int shift) { + x >>= shift; + y >>= shift; + return *this; + } + + friend ivec2 operator*(ivec2 a, I32 b) { return ivec2(a.x * b, a.y * b); } + friend ivec2 operator&(ivec2 a, ivec2 b) { + return ivec2(a.x & b.x, a.y & b.y); + } + friend ivec2 operator&(ivec2 a, I32 b) { return ivec2(a.x & b, a.y & b); } + friend ivec2 operator+(ivec2 a, ivec2 b) { + return ivec2(a.x + b.x, a.y + b.y); + } +}; + +vec2::vec2(ivec2 a) : 
x(cast(a.x)), y(cast(a.y)) {} + +ivec2_scalar make_ivec2(int32_t n) { return ivec2_scalar{n, n}; } + +ivec2_scalar make_ivec2(uint32_t n) { + return ivec2_scalar{int32_t(n), int32_t(n)}; +} + +ivec2_scalar make_ivec2(int32_t x, int32_t y) { return ivec2_scalar{x, y}; } + +ivec2_scalar make_ivec2(uint32_t x, uint32_t y) { + return ivec2_scalar{int32_t(x), int32_t(y)}; +} + +vec2_scalar make_vec2(const ivec2_scalar& v) { + return vec2_scalar{float(v.x), float(v.y)}; +} + +ivec2_scalar make_ivec2(const vec2_scalar& v) { + return ivec2_scalar{int32_t(v.x), int32_t(v.y)}; +} + +template +ivec2 make_ivec2(const N& n) { + return ivec2(n); +} + +template +ivec2 make_ivec2(const X& x, const Y& y) { + return ivec2(x, y); +} + +ivec2_scalar force_scalar(const ivec2& v) { + return ivec2_scalar{force_scalar(v.x), force_scalar(v.y)}; +} + +struct ivec3_scalar { + int32_t x; + int32_t y; + int32_t z; + + ivec3_scalar() : ivec3_scalar(0) {} + IMPLICIT constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {} + constexpr ivec3_scalar(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {} + + int32_t& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + default: + UNREACHABLE; + } + } + int32_t& sel(XYZW c1) { return select(c1); } + ivec2_scalar sel(XYZW c1, XYZW c2) { + return ivec2_scalar{select(c1), select(c2)}; + } +}; + +struct ivec3 { + ivec3() : ivec3(0) {} + IMPLICIT ivec3(I32 a) : x(a), y(a), z(a) {} + ivec3(I32 x, I32 y, I32 z) : x(x), y(y), z(z) {} + ivec3(ivec2 a, I32 b) : x(a.x), y(a.y), z(b) {} + ivec3(vec2 a, Float b) : x(cast(a.x)), y(cast(a.y)), z(cast(b)) {} + I32 x; + I32 y; + I32 z; + + friend ivec3 operator+(ivec3 a, ivec3 b) { + return ivec3(a.x + b.x, a.y + b.y, a.z + b.z); + } +}; + +vec2_scalar make_vec2(ivec3_scalar s) { + return vec2_scalar{float(s.x), float(s.y)}; +} + +ivec3_scalar make_ivec3(int32_t n) { return ivec3_scalar{n, n, n}; } + +ivec3_scalar make_ivec3(const ivec2_scalar& v, int32_t z) { + return 
ivec3_scalar{v.x, v.y, z}; +} + +ivec3_scalar make_ivec3(int32_t x, int32_t y, int32_t z) { + return ivec3_scalar{x, y, z}; +} + +template +ivec3 make_ivec3(const N& n) { + return ivec3(n); +} + +template +ivec3 make_ivec3(const X& x, const Y& y) { + return ivec3(x, y); +} + +template +ivec3 make_ivec3(const X& x, const Y& y, const Z& z) { + return ivec3(x, y, z); +} + +struct ivec4_scalar { + typedef int32_t element_type; + + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + ivec4_scalar() : ivec4_scalar(0) {} + IMPLICIT constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {} + constexpr ivec4_scalar(int32_t x, int32_t y, int32_t z, int32_t w) + : x(x), y(y), z(z), w(w) {} + + int32_t& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + case W: + return w; + default: + UNREACHABLE; + } + } + int32_t& sel(XYZW c1) { return select(c1); } + ivec2_scalar sel(XYZW c1, XYZW c2) { + return ivec2_scalar{select(c1), select(c2)}; + } + + friend ivec4_scalar operator&(int32_t a, ivec4_scalar b) { + return ivec4_scalar{a & b.x, a & b.y, a & b.z, a & b.w}; + } + friend ivec4_scalar operator<<(ivec4_scalar a, int32_t b) { + return ivec4_scalar{a.x << b, a.y << b, a.z << b, a.w << b}; + } + + int32_t& operator[](int index) { + switch (index) { + case 0: + return x; + case 1: + return y; + case 2: + return z; + case 3: + return w; + default: + UNREACHABLE; + } + } +}; + +struct ivec4 { + typedef int32_t element_type; + + ivec4() : ivec4(I32(0)) {} + IMPLICIT ivec4(I32 a) : x(a), y(a), z(a), w(a) {} + ivec4(I32 x, I32 y, I32 z, I32 w) : x(x), y(y), z(z), w(w) {} + ivec4(ivec2 a, I32 b, I32 c) : x(a.x), y(a.y), z(b), w(c) {} + IMPLICIT constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} + constexpr ivec4(ivec4_scalar s0, ivec4_scalar s1, ivec4_scalar s2, + ivec4_scalar s3) + : x(I32{s0.x, s1.x, s2.x, s3.x}), + y(I32{s0.y, s1.y, s2.y, s3.y}), + z(I32{s0.z, s1.z, s2.z, s3.z}), + w(I32{s0.w, s1.w, s2.w, s3.w}) 
{} + + I32& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + case W: + return w; + default: + UNREACHABLE; + } + } + I32 sel(XYZW c1) { return select(c1); } + + ivec2 sel(XYZW c1, XYZW c2) { return ivec2(select(c1), select(c2)); } + + ivec3 sel(XYZW c1, XYZW c2, XYZW c3) { + return ivec3(select(c1), select(c2), select(c3)); + } + + friend ivec4 operator&(I32 a, ivec4 b) { + return ivec4(a & b.x, a & b.y, a & b.z, a & b.w); + } + + I32 x; + I32 y; + I32 z; + I32 w; +}; + +ivec4_scalar force_scalar(const ivec4& v) { + return ivec4_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z), + force_scalar(v.w)}; +} + +ivec4_scalar make_ivec4(int32_t n) { return ivec4_scalar{n, n, n, n}; } + +ivec4_scalar make_ivec4(const ivec2_scalar& xy, int32_t z, int32_t w) { + return ivec4_scalar{xy.x, xy.y, z, w}; +} + +ivec4_scalar make_ivec4(int32_t x, int32_t y, int32_t z, int32_t w) { + return ivec4_scalar{x, y, z, w}; +} + +template +ivec4 make_ivec4(const N& n) { + return ivec4(n); +} + +template +ivec4 make_ivec4(const X& x, const Y& y, const Z& z) { + return ivec4(x, y, z); +} + +template +ivec4 make_ivec4(const X& x, const Y& y, const Z& z, const W& w) { + return ivec4(x, y, z, w); +} + +SI ivec2 if_then_else(I32 c, ivec2 t, ivec2 e) { + return ivec2(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y)); +} + +SI ivec2 if_then_else(int32_t c, ivec2 t, ivec2 e) { return c ? t : e; } + +SI ivec4 if_then_else(I32 c, ivec4 t, ivec4 e) { + return ivec4(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y), + if_then_else(c, t.z, e.z), if_then_else(c, t.w, e.w)); +} + +SI ivec4 if_then_else(int32_t c, ivec4 t, ivec4 e) { return c ? 
t : e; } + +ivec4 operator&(I32 a, ivec4_scalar b) { + return ivec4(a & b.x, a & b.y, a & b.z, a & b.w); +} + +struct bvec3_scalar { + bool x; + bool y; + bool z; + + bvec3_scalar() : bvec3_scalar(false) {} + IMPLICIT constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {} + constexpr bvec3_scalar(bool x, bool y, bool z) : x(x), y(y), z(z) {} +}; + +struct bvec3_scalar1 { + bool x; + + IMPLICIT constexpr bvec3_scalar1(bool a) : x(a) {} + + operator bvec3_scalar() const { return bvec3_scalar(x); } +}; + +struct bvec3 { + bvec3() : bvec3(0) {} + IMPLICIT bvec3(Bool a) : x(a), y(a), z(a) {} + bvec3(Bool x, Bool y, Bool z) : x(x), y(y), z(z) {} + Bool& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + default: + UNREACHABLE; + } + } + Bool sel(XYZW c1) { return select(c1); } + + Bool x; + Bool y; + Bool z; +}; + +bvec3_scalar1 make_bvec3(bool n) { return bvec3_scalar1(n); } + +struct bvec4_scalar { + bool x; + bool y; + bool z; + bool w; + + bvec4_scalar() : bvec4_scalar(false) {} + IMPLICIT constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {} + constexpr bvec4_scalar(bool x, bool y, bool z, bool w) + : x(x), y(y), z(z), w(w) {} + + bool& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + case W: + return w; + default: + UNREACHABLE; + } + } + bool sel(XYZW c1) { return select(c1); } + bvec2_scalar sel(XYZW c1, XYZW c2) { + return bvec2_scalar(select(c1), select(c2)); + } +}; + +bvec4_scalar bvec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return bvec4_scalar{select(c1), select(c2), select(c3), select(c4)}; +} + +struct bvec4_scalar1 { + bool x; + + IMPLICIT constexpr bvec4_scalar1(bool a) : x(a) {} + + operator bvec4_scalar() const { return bvec4_scalar(x); } +}; + +struct bvec4 { + bvec4() : bvec4(0) {} + IMPLICIT bvec4(Bool a) : x(a), y(a), z(a), w(a) {} + bvec4(Bool x, Bool y, Bool z, Bool w) : x(x), y(y), z(z), w(w) {} + bvec4(bvec2 x, bvec2 y) : x(x.x), 
y(x.y), z(y.x), w(y.y) {} + Bool& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + case W: + return w; + default: + UNREACHABLE; + } + } + Bool sel(XYZW c1) { return select(c1); } + + Bool x; + Bool y; + Bool z; + Bool w; +}; + +bvec4_scalar1 make_bvec4(bool n) { return bvec4_scalar1(n); } + +bvec4_scalar make_bvec4(bool x, bool y, bool z, bool w) { + return bvec4_scalar{x, y, z, w}; +} + +bvec4_scalar make_bvec4(bvec2_scalar a, bvec2_scalar b) { + return bvec4_scalar{a.x, a.y, b.x, b.y}; +} + +template +bvec4 make_bvec4(const N& n) { + return bvec4(n); +} + +template +bvec4 make_bvec4(const X& x, const Y& y) { + return bvec4(x, y); +} + +template +bvec4 make_bvec4(const X& x, const Y& y, const Z& z, const W& w) { + return bvec4(x, y, z, w); +} + +struct vec2_ref { + vec2_ref(Float& x, Float& y) : x(x), y(y) {} + Float& x; + Float& y; + + Float& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + default: + UNREACHABLE; + } + } + Float& sel(XYZW c1) { return select(c1); } + + vec2_ref& operator=(const vec2& a) { + x = a.x; + y = a.y; + return *this; + } + + vec2_ref& operator/=(Float a) { + x /= a; + y /= a; + return *this; + } + + vec2_ref& operator/=(vec2 a) { + x /= a.x; + y /= a.y; + return *this; + } + + vec2_ref& operator+=(vec2 a) { + x += a.x; + y += a.y; + return *this; + } + vec2_ref& operator-=(vec2 a) { + x -= a.x; + y -= a.y; + return *this; + } + vec2_ref& operator*=(vec2 a) { + x *= a.x; + y *= a.y; + return *this; + } +}; + +struct vec3_scalar { + typedef struct vec3 vector_type; + typedef float element_type; + + float x; + float y; + float z; + + constexpr vec3_scalar() : vec3_scalar(0.0f) {} + IMPLICIT constexpr vec3_scalar(float a) : x(a), y(a), z(a) {} + constexpr vec3_scalar(float x, float y, float z) : x(x), y(y), z(z) {} + + float& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + default: + UNREACHABLE; + } + } + 
float& sel(XYZW c1) { return select(c1); } + vec2_scalar sel(XYZW c1, XYZW c2) { + return vec2_scalar(select(c1), select(c2)); + } + vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) { + return vec3_scalar(select(c1), select(c2), select(c3)); + } + vec2_scalar_ref lsel(XYZW c1, XYZW c2) { + return vec2_scalar_ref(select(c1), select(c2)); + } + + friend vec3_scalar operator*(vec3_scalar a, vec3_scalar b) { + return vec3_scalar{a.x * b.x, a.y * b.y, a.z * b.z}; + } + friend vec3_scalar operator*(vec3_scalar a, float b) { + return vec3_scalar{a.x * b, a.y * b, a.z * b}; + } + + friend vec3_scalar operator-(vec3_scalar a, vec3_scalar b) { + return vec3_scalar{a.x - b.x, a.y - b.y, a.z - b.z}; + } + friend vec3_scalar operator-(vec3_scalar a, float b) { + return vec3_scalar{a.x - b, a.y - b, a.z - b}; + } + friend vec3_scalar operator+(vec3_scalar a, vec3_scalar b) { + return vec3_scalar{a.x + b.x, a.y + b.y, a.z + b.z}; + } + friend vec3_scalar operator+(vec3_scalar a, float b) { + return vec3_scalar{a.x + b, a.y + b, a.z + b}; + } + + friend vec3_scalar operator/(vec3_scalar a, vec3_scalar b) { + return vec3_scalar{a.x / b.x, a.y / b.y, a.z / b.z}; + } + friend vec3_scalar operator/(vec3_scalar a, float b) { + return vec3_scalar{a.x / b, a.y / b, a.z / b}; + } + + vec3_scalar operator+=(vec3_scalar a) { + x += a.x; + y += a.y; + z += a.z; + return *this; + } + + friend bool operator==(const vec3_scalar& l, const vec3_scalar& r) { + return l.x == r.x && l.y == r.y && l.z == r.z; + } +}; + +struct vec3_scalar_ref { + vec3_scalar_ref(float& x, float& y, float& z) : x(x), y(y), z(z) {} + float& x; + float& y; + float& z; + + float& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + default: + UNREACHABLE; + } + } + float& sel(XYZW c1) { return select(c1); } + + vec3_scalar_ref& operator=(const vec3_scalar& a) { + x = a.x; + y = a.y; + z = a.z; + return *this; + } + + operator vec3_scalar() const { return vec3_scalar{x, y, z}; } 
+}; + +struct vec3 { + typedef struct vec3 vector_type; + typedef float element_type; + + constexpr vec3() : vec3(Float(0.0f)) {} + IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {} + constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {} + vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {} + explicit vec3(vec4); + IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {} + constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3) + : x(Float{s0.x, s1.x, s2.x, s3.x}), + y(Float{s0.y, s1.y, s2.y, s3.y}), + z(Float{s0.z, s1.z, s2.z, s3.z}) {} + Float x; + Float y; + Float z; + + Float& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + default: + UNREACHABLE; + } + } + Float& sel(XYZW c1) { return select(c1); } + + vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); } + + vec3 sel(XYZW c1, XYZW c2, XYZW c3) { + return vec3(select(c1), select(c2), select(c3)); + } + + vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); + + vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); } + + friend vec3 operator*(vec3 a, Float b) { + return vec3(a.x * b, a.y * b, a.z * b); + } + friend vec3 operator*(vec3 a, vec3 b) { + return vec3(a.x * b.x, a.y * b.y, a.z * b.z); + } + friend vec3 operator*(Float a, vec3 b) { + return vec3(a * b.x, a * b.y, a * b.z); + } + + friend vec3 operator/(vec3 a, Float b) { + return vec3(a.x / b, a.y / b, a.z / b); + } + friend vec3 operator/(vec3 a, vec3 b) { + return vec3(a.x / b.x, a.y / b.y, a.z / b.z); + } + + friend I32 operator==(const vec3& l, const vec3& r) { + return l.x == r.x && l.y == r.y && l.z == r.z; + } + + friend vec3 operator-(vec3 a, Float b) { + return vec3(a.x - b, a.y - b, a.z - b); + } + friend vec3 operator-(vec3 a, vec3 b) { + return vec3(a.x - b.x, a.y - b.y, a.z - b.z); + } + friend vec3 operator+(vec3 a, Float b) { + return vec3(a.x + b, a.y + b, a.z + b); + } + friend vec3 operator+(vec3 a, vec3 b) { + 
return vec3(a.x + b.x, a.y + b.y, a.z + b.z); + } + + vec3& operator+=(vec3_scalar a) { /* adds a's x/y/z to the matching components; returns *this by reference, same as the vec3 overload below */ + x += a.x; + y += a.y; + z += a.z; + return *this; + } + vec3& operator+=(vec3 a) { + x += a.x; + y += a.y; + z += a.z; + return *this; + } +}; + +vec3_scalar force_scalar(const vec3& v) { + return vec3_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z)}; +} + +vec3_scalar make_vec3(float n) { return vec3_scalar{n, n, n}; } + +vec3_scalar make_vec3(const vec2_scalar& v, float z) { + return vec3_scalar{v.x, v.y, z}; +} + +vec3_scalar make_vec3(float x, float y, float z) { + return vec3_scalar{x, y, z}; +} + +vec3_scalar make_vec3(int32_t x, int32_t y, float z) { + return vec3_scalar{float(x), float(y), z}; +} + +template +vec3 make_vec3(const N& n) { + return vec3(n); +} + +template +vec3 make_vec3(const X& x, const Y& y) { + return vec3(x, y); +} + +template +vec3 make_vec3(const X& x, const Y& y, const Z& z) { + return vec3(x, y, z); +} + +SI vec3 if_then_else(I32 c, vec3 t, vec3 e) { + return vec3(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y), + if_then_else(c, t.z, e.z)); +} + +SI vec3 if_then_else(int32_t c, vec3 t, vec3 e) { return c ? 
t : e; } + +SI vec3 if_then_else(ivec3 c, vec3 t, vec3 e) { + return vec3(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y), + if_then_else(c.z, t.z, e.z)); +} + +vec3 step(vec3 edge, vec3 x) { + return vec3(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z)); +} + +vec3_scalar step(vec3_scalar edge, vec3_scalar x) { + return vec3_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z)); +} + +SI vec3 min(vec3 a, vec3 b) { + return vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +} +SI vec3 min(vec3 a, Float b) { + return vec3(min(a.x, b), min(a.y, b), min(a.z, b)); +} +SI vec3_scalar min(vec3_scalar a, vec3_scalar b) { + return vec3_scalar{min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)}; +} + +SI vec3 max(vec3 a, vec3 b) { + return vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +} +SI vec3 max(vec3 a, Float b) { + return vec3(max(a.x, b), max(a.y, b), max(a.z, b)); +} +SI vec3_scalar max(vec3_scalar a, vec3_scalar b) { + return vec3_scalar{max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)}; +} + +vec3 pow(vec3 x, vec3 y) { + return vec3(pow(x.x, y.x), pow(x.y, y.y), pow(x.z, y.z)); +} + +struct vec3_ref { + vec3_ref(Float& x, Float& y, Float& z) : x(x), y(y), z(z) {} + Float& x; + Float& y; + Float& z; + vec3_ref& operator=(const vec3& a) { + x = a.x; + y = a.y; + z = a.z; + return *this; + } + + vec3_ref& operator/=(Float a) { + x /= a; + y /= a; + z /= a; + return *this; + } + + vec3_ref& operator*=(Float a) { + x *= a; + y *= a; + z *= a; + return *this; + } +}; + +struct vec4_scalar { + typedef struct vec4 vector_type; + typedef float element_type; + + float x; + float y; + float z; + float w; + + constexpr vec4_scalar() : vec4_scalar(0.0f) {} + IMPLICIT constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {} + constexpr vec4_scalar(float x, float y, float z, float w) + : x(x), y(y), z(z), w(w) {} + vec4_scalar(vec3_scalar xyz, float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} + + static vec4_scalar load_from_ptr(const float* f) { + return 
vec4_scalar(f[0], f[1], f[2], f[3]); + } + + ALWAYS_INLINE float& select(XYZW c) { + switch (c) { + case X: + return x; + case Y: + return y; + case Z: + return z; + case W: + return w; + default: + UNREACHABLE; + } + } + float& sel(XYZW c1) { return select(c1); } + vec2_scalar sel(XYZW c1, XYZW c2) { + return vec2_scalar{select(c1), select(c2)}; + } + vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) { + return vec3_scalar{select(c1), select(c2), select(c3)}; + } + vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return vec4_scalar{select(c1), select(c2), select(c3), select(c4)}; + } + vec2_scalar_ref lsel(XYZW c1, XYZW c2) { + return vec2_scalar_ref(select(c1), select(c2)); + } + vec3_scalar_ref lsel(XYZW c1, XYZW c2, XYZW c3) { + return vec3_scalar_ref(select(c1), select(c2), select(c3)); + } + + friend vec4_scalar operator*(vec4_scalar a, vec4_scalar b) { + return vec4_scalar{a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w}; + } + friend vec4_scalar operator*(vec4_scalar a, float b) { + return vec4_scalar{a.x * b, a.y * b, a.z * b, a.w * b}; + } + friend vec4_scalar operator*(float a, vec4_scalar b) { + return vec4_scalar{a * b.x, a * b.y, a * b.z, a * b.w}; + } + vec4_scalar& operator*=(float a) { + x *= a; + y *= a; + z *= a; + w *= a; + return *this; + } + + friend vec4_scalar operator-(vec4_scalar a, vec4_scalar b) { + return vec4_scalar{a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w}; + } + friend vec4_scalar operator-(vec4_scalar a, float b) { + return vec4_scalar{a.x - b, a.y - b, a.z - b, a.w - b}; + } + friend vec4_scalar operator+(vec4_scalar a, vec4_scalar b) { + return vec4_scalar{a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w}; + } + friend vec4_scalar operator+(vec4_scalar a, float b) { + return vec4_scalar{a.x + b, a.y + b, a.z + b, a.w + b}; + } + + friend vec4_scalar operator/(vec4_scalar a, vec4_scalar b) { + return vec4_scalar{a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w}; + } + friend vec4_scalar operator/(vec4_scalar a, float b) { + return vec4_scalar{a.x 
/ b, a.y / b, a.z / b, a.w / b}; + } + + vec4_scalar& operator+=(vec4_scalar a) { + x += a.x; + y += a.y; + z += a.z; + w += a.w; + return *this; + } + + vec4_scalar& operator/=(vec4_scalar a) { + x /= a.x; + y /= a.y; + z /= a.z; + w /= a.w; + return *this; + } + + vec4_scalar& operator*=(vec4_scalar a) { + x *= a.x; + y *= a.y; + z *= a.z; + w *= a.w; + return *this; + } + + friend bool operator==(const vec4_scalar& l, const vec4_scalar& r) { + return l.x == r.x && l.y == r.y && l.z == r.z && l.w == r.w; + } + + friend bool operator!=(const vec4_scalar& l, const vec4_scalar& r) { + return l.x != r.x || l.y != r.y || l.z != r.z || l.w != r.w; + } +}; + +vec3_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3) { + return {select(c1), select(c2), select(c3)}; +} +vec4_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return vec4_scalar{select(c1), select(c2), select(c3), select(c4)}; +} + +struct vec4_ref { + vec4_ref(Float& x, Float& y, Float& z, Float& w) : x(x), y(y), z(z), w(w) {} + Float& x; + Float& y; + Float& z; + Float& w; + + vec4_ref& operator=(const vec4& a); +}; + +struct vec4 { + typedef struct vec4 vector_type; + typedef float element_type; + + constexpr vec4() : vec4(Float(0.0f)) {} + IMPLICIT constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {} + vec4(Float x, Float y, Float z, Float w) : x(x), y(y), z(z), w(w) {} + vec4(vec3 xyz, Float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} + vec4(vec2 xy, vec2 zw) : x(xy.x), y(xy.y), z(zw.x), w(zw.y) {} + vec4(vec2 xy, Float z, Float w) : x(xy.x), y(xy.y), z(z), w(w) {} + vec4(Float x, Float y, vec2 zw) : x(x), y(y), z(zw.x), w(zw.y) {} + IMPLICIT constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} + constexpr vec4(vec4_scalar s0, vec4_scalar s1, vec4_scalar s2, vec4_scalar s3) + : x(Float{s0.x, s1.x, s2.x, s3.x}), + y(Float{s0.y, s1.y, s2.y, s3.y}), + z(Float{s0.z, s1.z, s2.z, s3.z}), + w(Float{s0.w, s1.w, s2.w, s3.w}) {} + ALWAYS_INLINE Float& select(XYZW c) { + switch (c) { + case 
X: + return x; + case Y: + return y; + case Z: + return z; + case W: + return w; + default: + UNREACHABLE; + } + } + ALWAYS_INLINE Float& sel(XYZW c1) { return select(c1); } + + ALWAYS_INLINE vec2 sel(XYZW c1, XYZW c2) { + return vec2(select(c1), select(c2)); + } + + ALWAYS_INLINE vec3 sel(XYZW c1, XYZW c2, XYZW c3) { + return vec3(select(c1), select(c2), select(c3)); + } + ALWAYS_INLINE vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) { + return vec3_ref(select(c1), select(c2), select(c3)); + } + + ALWAYS_INLINE vec2_ref lsel(XYZW c1, XYZW c2) { + return vec2_ref(select(c1), select(c2)); + } + + ALWAYS_INLINE vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return vec4(select(c1), select(c2), select(c3), select(c4)); + } + ALWAYS_INLINE vec4_ref lsel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return vec4_ref(select(c1), select(c2), select(c3), select(c4)); + } + + Float& operator[](int index) { + switch (index) { + case 0: + return x; + case 1: + return y; + case 2: + return z; + case 3: + return w; + default: + UNREACHABLE; + } + } + + // glsl supports non-const indexing of vecs. + // hlsl doesn't. The code it generates is probably not wonderful. 
+ Float operator[](I32 index) { + float sel_x = 0; + switch (index.x) { + case 0: + sel_x = x.x; + break; + case 1: + sel_x = y.x; + break; + case 2: + sel_x = z.x; + break; + case 3: + sel_x = w.x; + break; + } + float sel_y = 0; + switch (index.y) { + case 0: + sel_y = x.y; + break; + case 1: + sel_y = y.y; + break; + case 2: + sel_y = z.y; + break; + case 3: + sel_y = w.y; + break; + } + float sel_z = 0; + switch (index.z) { + case 0: + sel_z = x.z; + break; + case 1: + sel_z = y.z; + break; + case 2: + sel_z = z.z; + break; + case 3: + sel_z = w.z; + break; + } + float sel_w = 0; + switch (index.w) { + case 0: + sel_w = x.w; + break; + case 1: + sel_w = y.w; + break; + case 2: + sel_w = z.w; + break; + case 3: + sel_w = w.w; + break; + } + Float ret = {sel_x, sel_y, sel_z, sel_w}; + return ret; + } + + friend vec4 operator/(vec4 a, Float b) { + return vec4(a.x / b, a.y / b, a.z / b, a.w / b); + } + friend vec4 operator/(vec4 a, vec4 b) { + return vec4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); + } + + friend vec4 operator*(vec4 a, Float b) { + return vec4(a.x * b, a.y * b, a.z * b, a.w * b); + } + + friend vec4 operator*(Float b, vec4 a) { + return vec4(a.x * b, a.y * b, a.z * b, a.w * b); + } + friend vec4 operator*(vec4 a, vec4 b) { + return vec4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); + } + + friend vec4 operator-(vec4 a, vec4 b) { + return vec4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); + } + friend vec4 operator+(vec4 a, vec4 b) { + return vec4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + vec4& operator+=(vec4 a) { + x += a.x; + y += a.y; + z += a.z; + w += a.w; + return *this; + } + vec4& operator/=(vec4 a) { + x /= a.x; + y /= a.y; + z /= a.z; + w /= a.w; + return *this; + } + vec4& operator*=(vec4 a) { + x *= a.x; + y *= a.y; + z *= a.z; + w *= a.w; + return *this; + } + vec4& operator*=(Float a) { + x *= a; + y *= a; + z *= a; + w *= a; + return *this; + } + + Float x; + Float y; + Float z; + Float w; +}; + +inline vec4_ref& 
vec4_ref::operator=(const vec4& a) { + x = a.x; + y = a.y; + z = a.z; + w = a.w; + return *this; +} + +inline vec4 vec3::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return vec4(select(c1), select(c2), select(c3), select(c4)); +} + +vec4_scalar force_scalar(const vec4& v) { + return vec4_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z), + force_scalar(v.w)}; +} + +vec4_scalar make_vec4(float n) { return vec4_scalar{n, n, n, n}; } + +vec4_scalar make_vec4(const vec2_scalar& v, float z, float w) { + return vec4_scalar{v.x, v.y, z, w}; +} + +vec4_scalar make_vec4(const vec2_scalar& a, const vec2_scalar& b) { + return vec4_scalar{a.x, a.y, b.x, b.y}; +} + +vec4_scalar make_vec4(const vec3_scalar& v, float w) { + return vec4_scalar{v.x, v.y, v.z, w}; +} + +vec4_scalar make_vec4(float x, float y, float z, float w) { + return vec4_scalar{x, y, z, w}; +} + +vec4_scalar make_vec4(float x, float y, const vec2_scalar& v) { + return vec4_scalar{x, y, v.x, v.y}; +} + +ivec4_scalar make_ivec4(const vec4_scalar& v) { + return ivec4_scalar{int32_t(v.x), int32_t(v.y), int32_t(v.z), int32_t(v.w)}; +} + +template +vec4 make_vec4(const N& n) { + return vec4(n); +} + +template +vec4 make_vec4(const X& x, const Y& y) { + return vec4(x, y); +} + +template +vec4 make_vec4(const X& x, const Y& y, const Z& z) { + return vec4(x, y, z); +} + +template +vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) { + return vec4(x, y, z, w); +} + +vec4_scalar make_vec4(const ivec4_scalar& v) { + return vec4_scalar{float(v.x), float(v.y), float(v.z), float(v.w)}; +} + +ALWAYS_INLINE vec3::vec3(vec4 v) : x(v.x), y(v.y), z(v.z) {} + +SI ivec4 roundfast(vec4 v, Float scale) { + return ivec4(roundfast(v.x, scale), roundfast(v.y, scale), + roundfast(v.z, scale), roundfast(v.w, scale)); +} + +vec4 operator*(vec4_scalar a, Float b) { + return vec4(a.x * b, a.y * b, a.z * b, a.w * b); +} + +SI vec4 if_then_else(I32 c, vec4 t, vec4 e) { + return vec4(if_then_else(c, t.x, e.x), 
if_then_else(c, t.y, e.y), + if_then_else(c, t.z, e.z), if_then_else(c, t.w, e.w)); +} + +SI vec4 if_then_else(int32_t c, vec4 t, vec4 e) { return c ? t : e; } + +SI vec4_scalar if_then_else(int32_t c, vec4_scalar t, vec4_scalar e) { + return c ? t : e; +} + +SI vec2 clamp(vec2 a, Float minVal, Float maxVal) { + return vec2(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)); +} + +SI vec2 clamp(vec2 a, vec2 minVal, vec2 maxVal) { + return vec2(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y)); +} + +SI vec2_scalar clamp(vec2_scalar a, vec2_scalar minVal, vec2_scalar maxVal) { + return vec2_scalar{clamp(a.x, minVal.x, maxVal.x), + clamp(a.y, minVal.y, maxVal.y)}; +} + +SI vec2_scalar clamp(vec2_scalar a, float minVal, float maxVal) { + return vec2_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)}; +} + +SI I32 clamp(I32 a, I32 minVal, I32 maxVal) { + a = if_then_else(a < minVal, minVal, a); + return if_then_else(a > maxVal, maxVal, a); +} + +SI vec3 clamp(vec3 a, Float minVal, Float maxVal) { + return vec3(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), + clamp(a.z, minVal, maxVal)); +} + +SI vec3 clamp(vec3 a, vec3 minVal, vec3 maxVal) { + return vec3(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), + clamp(a.z, minVal.z, maxVal.z)); +} + +SI vec4 clamp(vec4 a, Float minVal, Float maxVal) { + return vec4(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), + clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)); +} + +SI vec4 clamp(vec4 a, vec4 minVal, vec4 maxVal) { + return vec4(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), + clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)); +} + +SI vec4_scalar clamp(vec4_scalar a, vec4_scalar minVal, vec4_scalar maxVal) { + return vec4_scalar{ + clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), + clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)}; +} + +SI vec4_scalar clamp(vec4_scalar a, float minVal, float 
maxVal) { + return vec4_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), + clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)}; +} + +vec4 step(vec4 edge, vec4 x) { + return vec4(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z), + step(edge.w, x.w)); +} + +vec4_scalar step(vec4_scalar edge, vec4_scalar x) { + return vec4_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z), + step(edge.w, x.w)); +} + +template +auto lessThanEqual(T x, T y) -> decltype(x <= y) { + return x <= y; +} + +template +auto lessThan(T x, T y) -> decltype(x < y) { + return x < y; +} + +SI bvec3 lessThanEqual(vec3 x, vec3 y) { + return bvec3(lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y), + lessThanEqual(x.z, y.z)); +} + +SI bvec2 lessThanEqual(vec2 x, vec2 y) { + return bvec2(lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y)); +} + +SI bvec2_scalar lessThanEqual(vec2_scalar x, vec2_scalar y) { + return bvec2_scalar{lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y)}; +} + +SI bvec4 lessThanEqual(vec4 x, vec4 y) { + return bvec4(lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y), + lessThanEqual(x.z, y.z), lessThanEqual(x.w, y.w)); +} + +SI bvec4_scalar lessThanEqual(vec4_scalar x, vec4_scalar y) { + return bvec4_scalar{lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y), + lessThanEqual(x.z, y.z), lessThanEqual(x.w, y.w)}; +} + +SI bvec2 lessThan(vec2 x, vec2 y) { + return bvec2(lessThan(x.x, y.x), lessThan(x.y, y.y)); +} + +SI bvec2_scalar lessThan(vec2_scalar x, vec2_scalar y) { + return bvec2_scalar(lessThan(x.x, y.x), lessThan(x.y, y.y)); +} + +SI bvec4 lessThan(vec4 x, vec4 y) { + return bvec4(lessThan(x.x, y.x), lessThan(x.y, y.y), lessThan(x.z, y.z), + lessThan(x.w, y.w)); +} + +SI bvec4_scalar lessThan(vec4_scalar x, vec4_scalar y) { + return bvec4_scalar{lessThan(x.x, y.x), lessThan(x.y, y.y), + lessThan(x.z, y.z), lessThan(x.w, y.w)}; +} + +template +auto greaterThan(T x, T y) -> decltype(x > y) { + return x > y; +} + +bvec2 greaterThan(vec2 x, vec2 y) 
{ + return bvec2(greaterThan(x.x, y.x), greaterThan(x.y, y.y)); +} + +bvec2_scalar greaterThan(vec2_scalar x, vec2_scalar y) { + return bvec2_scalar(greaterThan(x.x, y.x), greaterThan(x.y, y.y)); +} + +SI bvec4 greaterThan(vec4 x, vec4 y) { + return bvec4(greaterThan(x.x, y.x), greaterThan(x.y, y.y), + greaterThan(x.z, y.z), greaterThan(x.w, y.w)); +} + +SI bvec4_scalar greaterThan(vec4_scalar x, vec4_scalar y) { + return bvec4_scalar{greaterThan(x.x, y.x), greaterThan(x.y, y.y), + greaterThan(x.z, y.z), greaterThan(x.w, y.w)}; +} + +template +auto greaterThanEqual(T x, T y) -> decltype(x >= y) { + return x >= y; +} + +bvec4 greaterThanEqual(vec4 x, vec4 y) { + return bvec4(greaterThanEqual(x.x, y.x), greaterThanEqual(x.y, y.y), + greaterThanEqual(x.z, y.z), greaterThanEqual(x.w, y.w)); +} + +template +auto equal(T x, T y) -> decltype(x > y) { + return x == y; +} + +bvec2 equal(vec2 x, vec2 y) { return bvec2(equal(x.x, y.x), equal(x.y, y.y)); } + +bvec2_scalar equal(vec2_scalar x, vec2_scalar y) { + return bvec2_scalar(equal(x.x, y.x), equal(x.y, y.y)); +} + +template +auto notEqual(T x, T y) -> decltype(x > y) { + return x != y; +} + +bvec2 notEqual(vec2 x, vec2 y) { + return bvec2(notEqual(x.x, y.x), notEqual(x.y, y.y)); +} + +bvec2_scalar notEqual(vec2_scalar x, vec2_scalar y) { + return bvec2_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y)); +} + +struct mat4_scalar; + +struct mat2_scalar { + vec2_scalar data[2]; + + mat2_scalar() = default; + IMPLICIT constexpr mat2_scalar(float a) + : data{vec2_scalar(a), vec2_scalar(a)} {} + constexpr mat2_scalar(vec2_scalar a, vec2_scalar b) : data{a, b} {} + IMPLICIT mat2_scalar(const mat4_scalar& mat); + + vec2_scalar& operator[](int index) { return data[index]; } + const vec2_scalar& operator[](int index) const { return data[index]; } + + friend vec2_scalar operator*(mat2_scalar m, vec2_scalar v) { + vec2_scalar u; + u.x = m[0].x * v.x + m[1].x * v.y; + u.y = m[0].y * v.x + m[1].y * v.y; + return u; + } + + friend vec2 
operator*(mat2_scalar m, vec2 v) { + vec2 u; + u.x = m[0].x * v.x + m[1].x * v.y; + u.y = m[0].y * v.x + m[1].y * v.y; + return u; + } + + friend mat2_scalar operator*(mat2_scalar m, float f) { + mat2_scalar u = m; + u[0].x *= f; + u[0].y *= f; + u[1].x *= f; + u[1].y *= f; + return u; + } +}; + +struct mat4; + +struct mat2 { + vec2 data[2]; + + vec2& operator[](int index) { return data[index]; } + const vec2& operator[](int index) const { return data[index]; } + mat2() = default; + + IMPLICIT constexpr mat2(Float a) : data{vec2(a), vec2(a)} {} + + constexpr mat2(vec2 a, vec2 b) : data{a, b} {} + IMPLICIT mat2(const mat4& mat); + IMPLICIT constexpr mat2(mat2_scalar s) + : data{vec2(s.data[0]), vec2(s.data[1])} {} + + friend vec2 operator*(mat2 m, vec2 v) { + vec2 u; + u.x = m[0].x * v.x + m[1].x * v.y; + u.y = m[0].y * v.x + m[1].y * v.y; + return u; + } + friend mat2 operator*(mat2 m, Float f) { + mat2 u = m; + u[0].x *= f; + u[0].y *= f; + u[1].x *= f; + u[1].y *= f; + return u; + } +}; + +mat2_scalar make_mat2(float n) { return mat2_scalar{{n, n}, {n, n}}; } + +mat2_scalar make_mat2(const mat2_scalar& m) { return m; } + +mat2_scalar make_mat2(const vec2_scalar& x, const vec2_scalar& y) { + return mat2_scalar{x, y}; +} + +template +mat2 make_mat2(const N& n) { + return mat2(n); +} + +template +mat2 make_mat2(const X& x, const Y& y) { + return mat2(x, y); +} + +SI mat2 if_then_else(I32 c, mat2 t, mat2 e) { /* builds the result column by column through the vec2 overload of if_then_else */ + return mat2(if_then_else(c, t[0], e[0]), if_then_else(c, t[1], e[1])); /* column 1 pairs t[1] with e[1]; it previously reused t[0], so t's second column was never selected */ +} + +SI mat2 if_then_else(int32_t c, mat2 t, mat2 e) { return c ? 
t : e; } + +struct mat3_scalar { + vec3_scalar data[3]; + + mat3_scalar() = default; + constexpr mat3_scalar(vec3_scalar a, vec3_scalar b, vec3_scalar c) + : data{a, b, c} {} + IMPLICIT mat3_scalar(const mat4_scalar& mat); + + vec3_scalar& operator[](int index) { return data[index]; } + const vec3_scalar& operator[](int index) const { return data[index]; } + + friend vec3_scalar operator*(mat3_scalar m, vec3_scalar v) { + vec3_scalar u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z; + return u; + } + + friend vec3 operator*(mat3_scalar m, vec3 v) { + vec3 u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z; + return u; + } + + friend auto operator*(mat3_scalar a, mat3_scalar b) { + mat3_scalar r; + for (int c = 0; c < 3; c++) { + const auto& v = b[c]; + r[c].x = a[0].x * v.x + a[1].x * v.y + a[2].x * v.z; + r[c].y = a[0].y * v.x + a[1].y * v.y + a[2].y * v.z; + r[c].z = a[0].z * v.x + a[1].z * v.y + a[2].z * v.z; + } + return r; + } +}; + +struct mat3 { + vec3 data[3]; + + vec3& operator[](int index) { return data[index]; } + const vec3& operator[](int index) const { return data[index]; } + mat3() = default; + constexpr mat3(vec3 a, vec3 b, vec3 c) : data{a, b, c} {} + + IMPLICIT constexpr mat3(mat3_scalar s) + : data{vec3(s.data[0]), vec3(s.data[1]), vec3(s.data[2])} {} + + constexpr mat3(mat3_scalar s0, mat3_scalar s1, mat3_scalar s2, mat3_scalar s3) + : data{vec3(s0.data[0], s1.data[0], s2.data[0], s3.data[0]), + vec3(s0.data[1], s1.data[1], s2.data[1], s3.data[1]), + vec3(s0.data[2], s1.data[2], s2.data[2], s3.data[2])} {} + + constexpr mat3(Float d1, Float d2, Float d3, Float d4, Float d5, Float d6, + Float d7, Float d8, Float d9) + : data{vec3(d1, d2, d3), vec3(d4, d5, d6), vec3(d7, d8, d9)} {} + + IMPLICIT mat3(const mat4& mat); + + friend vec3 
operator*(mat3 m, vec3 v) { + vec3 u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z; + return u; + } +}; + +mat3_scalar force_scalar(const mat3& v) { + return mat3_scalar{force_scalar(v[0]), force_scalar(v[1]), + force_scalar(v[2])}; +} + +mat3_scalar make_mat3(const mat3_scalar& m) { return m; } + +mat3_scalar make_mat3(const vec3_scalar& x, const vec3_scalar& y, + const vec3_scalar& z) { + return mat3_scalar{x, y, z}; +} + +constexpr mat3_scalar make_mat3(float m0, float m1, float m2, float m3, + float m4, float m5, float m6, float m7, + float m8) { + return mat3_scalar{{m0, m1, m2}, {m3, m4, m5}, {m6, m7, m8}}; +} + +template +mat3 make_mat3(const N& n) { + return mat3(n); +} + +template +mat3 make_mat3(const X& x, const Y& y, const Z& z) { + return mat3(x, y, z); +} + +struct mat3x4_scalar { + vec4_scalar data[3]; + + mat3x4_scalar() = default; + constexpr mat3x4_scalar(vec4_scalar a, vec4_scalar b, vec4_scalar c) + : data{a, b, c} {} + + auto& operator[](int index) { return data[index]; } + constexpr auto operator[](int index) const { return data[index]; } + + friend auto operator*(mat3x4_scalar m, vec3_scalar v) { + vec4_scalar u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z; + u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z; + return u; + } + + friend auto operator*(mat3x4_scalar m, vec3 v) { + vec4 u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z; + u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z; + return u; + } +}; + +constexpr mat3x4_scalar make_mat3x4(float m0, float m1, float m2, float m3, + float m4, float m5, float m6, float m7, + float m8, float m9, float m10, float m11) { + return mat3x4_scalar{ + {m0, m1, m2, m3}, + {m4, 
m5, m6, m7}, + {m8, m9, m10, m11}, + }; +} + +struct mat4x3_scalar { + vec3_scalar data[4]; + + mat4x3_scalar() = default; + constexpr mat4x3_scalar(vec3_scalar a, vec3_scalar b, vec3_scalar c, + vec3_scalar d) + : data{a, b, c, d} {} + + auto& operator[](int index) { return data[index]; } + constexpr auto operator[](int index) const { return data[index]; } + + friend auto operator*(mat4x3_scalar m, vec4_scalar v) { + vec3_scalar u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w; + return u; + } + + friend auto operator*(mat4x3_scalar m, vec4 v) { + vec3 u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w; + return u; + } +}; + +constexpr mat4x3_scalar transpose(const mat3x4_scalar m) { + return {{m[0].x, m[1].x, m[2].x}, + {m[0].y, m[1].y, m[2].y}, + {m[0].z, m[1].z, m[2].z}, + {m[0].w, m[1].w, m[2].w}}; +} + +struct mat4_scalar { + vec4_scalar data[4]; + + mat4_scalar() = default; + constexpr mat4_scalar(vec4_scalar a, vec4_scalar b, vec4_scalar c, + vec4_scalar d) + : data{a, b, c, d} {} + + vec4_scalar& operator[](int index) { return data[index]; } + const vec4_scalar& operator[](int index) const { return data[index]; } + + static mat4_scalar load_from_ptr(const float* f) { + return mat4_scalar( + vec4_scalar::load_from_ptr(&f[0]), vec4_scalar::load_from_ptr(&f[4]), + vec4_scalar::load_from_ptr(&f[8]), vec4_scalar::load_from_ptr(&f[12])); + } + + friend vec4_scalar operator*(mat4_scalar m, vec4_scalar v) { + vec4_scalar u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w; + u.w = m[0].w * v.x + m[1].w * v.y + 
m[2].w * v.z + m[3].w * v.w; + return u; + } + + friend vec4 operator*(mat4_scalar m, vec4 v) { + vec4 u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w; + u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w; + return u; + } +}; + +struct mat4 { + vec4 data[4]; + + mat4() = default; + IMPLICIT constexpr mat4(mat4_scalar s) + : data{vec4(s.data[0]), vec4(s.data[1]), vec4(s.data[2]), + vec4(s.data[3])} {} + + constexpr mat4(vec4 a, vec4 b, vec4 c, vec4 d) : data{a, b, c, d} {} + + vec4& operator[](int index) { return data[index]; } + const vec4& operator[](int index) const { return data[index]; } + + friend vec4 operator*(mat4 m, vec4 v) { + vec4 u; + u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w; + u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w; + u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w; + u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w; + return u; + } +}; + +mat3::mat3(const mat4& mat) + : mat3(vec3(mat[0].x, mat[0].y, mat[0].z), + vec3(mat[1].x, mat[1].y, mat[1].z), + vec3(mat[2].x, mat[2].y, mat[2].z)) {} + +IMPLICIT mat3_scalar::mat3_scalar(const mat4_scalar& mat) + : mat3_scalar(vec3_scalar(mat[0].x, mat[0].y, mat[0].z), + vec3_scalar(mat[1].x, mat[1].y, mat[1].z), + vec3_scalar(mat[2].x, mat[2].y, mat[2].z)) {} + +IMPLICIT mat2::mat2(const mat4& mat) + : mat2(vec2(mat[0].x, mat[0].y), vec2(mat[1].x, mat[1].y)) {} + +IMPLICIT mat2_scalar::mat2_scalar(const mat4_scalar& mat) + : mat2_scalar(vec2_scalar(mat[0].x, mat[0].y), + vec2_scalar(mat[1].x, mat[1].y)) {} + +mat2_scalar make_mat2(const mat4_scalar& m) { return mat2_scalar(m); } + +mat3_scalar make_mat3(const mat4_scalar& m) { return mat3_scalar(m); } + +mat4_scalar force_scalar(const mat4& v) { + return mat4_scalar(force_scalar(v[0]), force_scalar(v[1]), 
force_scalar(v[2]), + force_scalar(v[3])); +} + +mat4_scalar make_mat4(const mat4_scalar& m) { return m; } + +mat4_scalar make_mat4(const vec4_scalar& x, const vec4_scalar& y, + const vec4_scalar& z, const vec4_scalar& w) { + return mat4_scalar{x, y, z, w}; +} + +constexpr mat4_scalar make_mat4(float m0, float m1, float m2, float m3, + float m4, float m5, float m6, float m7, + float m8, float m9, float m10, float m11, + float m12, float m13, float m14, float m15) { + return mat4_scalar{{m0, m1, m2, m3}, + {m4, m5, m6, m7}, + {m8, m9, m10, m11}, + {m12, m13, m14, m15}}; +} + +template +mat4 make_mat4(const N& n) { + return mat4(n); +} + +template +mat4 make_mat4(const X& x, const Y& y, const Z& z, const W& w) { + return mat4(x, y, z, w); +} + +SI mat3 if_then_else(I32 c, mat3 t, mat3 e) { + return mat3{if_then_else(c, t[0], e[0]), if_then_else(c, t[1], e[1]), + if_then_else(c, t[2], e[2])}; +} + +SI mat3 if_then_else(int32_t c, mat3 t, mat3 e) { return c ? t : e; } + +SI mat4 if_then_else(I32 c, mat4 t, mat4 e) { + return mat4{if_then_else(c, t[0], e[0]), if_then_else(c, t[1], e[1]), + if_then_else(c, t[2], e[2]), if_then_else(c, t[3], e[3])}; +} + +SI mat4 if_then_else(int32_t c, mat4 t, mat4 e) { return c ? 
t : e; } + +template +SI R mix(T x, U y, A a) { + return (y - x) * a + x; +} + +SI Float mix(Float x, Float y, Float a) { return (y - x) * a + x; } + +template +SI T mix(T x, T y, float a) { + return (y - x) * a + x; +} + +template +SI T mix(T x, T y, vec2_scalar a) { + return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y)}; +} + +template +SI T mix(T x, T y, vec3_scalar a) { + return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z)}; +} + +template +SI T mix(T x, T y, vec4_scalar a) { + return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z), + mix(x.w, y.w, a.w)}; +} + +ivec4 ivec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return ivec4(select(c1), select(c2), select(c3), select(c4)); +} + +vec4 vec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { + return vec4(select(c1), select(c2), select(c3), select(c4)); +} + +bool any(bool x) { return x; } + +Bool any(bvec4 x) { return x.x | x.y | x.z | x.w; } + +bool any(bvec4_scalar x) { return x.x | x.y | x.z | x.w; } + +Bool any(bvec2 x) { return x.x | x.y; } + +bool any(bvec2_scalar x) { return x.x | x.y; } + +bool all(bool x) { return x; } + +Bool all(bvec2 x) { return x.x & x.y; } + +bool all(bvec2_scalar x) { return x.x & x.y; } + +Bool all(bvec4 x) { return x.x & x.y & x.z & x.w; } + +bool all(bvec4_scalar x) { return x.x & x.y & x.z & x.w; } + +SI vec4 if_then_else(bvec4 c, vec4 t, vec4 e) { + return vec4(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y), + if_then_else(c.z, t.z, e.z), if_then_else(c.w, t.w, e.w)); +} +SI vec3 if_then_else(bvec3 c, vec3 t, vec3 e) { + return vec3(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y), + if_then_else(c.z, t.z, e.z)); +} + +SI vec2 if_then_else(bvec2 c, vec2 t, vec2 e) { + return vec2(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y)); +} + +template +SI R mix(T x, T y, bvec4 a) { + return if_then_else(a, y, x); +} + +template +SI R mix(T x, T y, bvec3 a) { + return if_then_else(a, y, x); +} + +template +SI R mix(T x, T y, bvec2 a) { + 
return if_then_else(a, y, x); +} + +template +SI T mix(T x, T y, bvec4_scalar a) { + return T{a.x ? y.x : x.x, a.y ? y.y : x.y, a.z ? y.z : x.z, a.w ? y.w : x.w}; +} + +template +SI T mix(T x, T y, bvec4_scalar1 a) { + return a.x ? y : x; +} + +template +SI T mix(T x, T y, bvec3_scalar a) { + return T{a.x ? y.x : x.x, a.y ? y.y : x.y, a.z ? y.z : x.z}; +} + +template +SI T mix(T x, T y, bvec3_scalar1 a) { + return a.x ? y : x; +} + +template +SI T mix(T x, T y, bvec2_scalar a) { + return T{a.x ? y.x : x.x, a.y ? y.y : x.y}; +} + +template +SI T mix(T x, T y, bvec2_scalar1 a) { + return a.x ? y : x; +} + +float dot(vec3_scalar a, vec3_scalar b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +Float dot(vec3 a, vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } + +float dot(vec2_scalar a, vec2_scalar b) { return a.x * b.x + a.y * b.y; } + +Float dot(vec2 a, vec2 b) { return a.x * b.x + a.y * b.y; } + +#define sin __glsl_sin + +float sin(float x) { return sinf(x); } + +Float sin(Float v) { return {sinf(v.x), sinf(v.y), sinf(v.z), sinf(v.w)}; } + +#define cos __glsl_cos + +float cos(float x) { return cosf(x); } + +Float cos(Float v) { return {cosf(v.x), cosf(v.y), cosf(v.z), cosf(v.w)}; } + +#define tan __glsl_tan + +float tan(float x) { return tanf(x); } + +Float tan(Float v) { return {tanf(v.x), tanf(v.y), tanf(v.z), tanf(v.w)}; } + +#define atan __glsl_atan + +float atan(float x) { return atanf(x); } + +Float atan(Float v) { return {atanf(v.x), atanf(v.y), atanf(v.z), atanf(v.w)}; } + +float atan(float a, float b) { return atan2f(a, b); } + +Float atan(Float a, Float b) { + return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z), + atan2f(a.w, b.w)}; +} + +bvec4 equal(vec4 x, vec4 y) { + return bvec4(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z), + equal(x.w, y.w)); +} + +bvec4_scalar equal(vec4_scalar x, vec4_scalar y) { + return bvec4_scalar(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z), + equal(x.w, y.w)); +} + +bvec4 notEqual(vec4 x, vec4 y) { 
+ return bvec4(notEqual(x.x, y.x), notEqual(x.y, y.y), notEqual(x.z, y.z), + notEqual(x.w, y.w)); +} + +bvec4_scalar notEqual(vec4_scalar x, vec4_scalar y) { + return bvec4_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y), + notEqual(x.z, y.z), notEqual(x.w, y.w)); +} + +bvec4 notEqual(ivec4 a, ivec4 b) { + return bvec4(a.x != b.x, a.y != b.y, a.z != b.z, a.w != b.w); +} + +bvec4_scalar notEqual(ivec4_scalar a, ivec4_scalar b) { + return bvec4_scalar{a.x != b.x, a.y != b.y, a.z != b.z, a.w != b.w}; +} + +mat3 transpose(mat3 m) { + return mat3(vec3(m[0].x, m[1].x, m[2].x), vec3(m[0].y, m[1].y, m[2].y), + vec3(m[0].z, m[1].z, m[2].z)); +} + +mat3_scalar transpose(mat3_scalar m) { + return mat3_scalar{vec3_scalar(m[0].x, m[1].x, m[2].x), + vec3_scalar(m[0].y, m[1].y, m[2].y), + vec3_scalar(m[0].z, m[1].z, m[2].z)}; +} + +vec2 abs(vec2 v) { return vec2(abs(v.x), abs(v.y)); } + +vec2_scalar abs(vec2_scalar v) { return vec2_scalar{fabsf(v.x), fabsf(v.y)}; } + +vec2 sign(vec2 v) { return vec2(sign(v.x), sign(v.y)); } + +vec2_scalar sign(vec2_scalar v) { return vec2_scalar{sign(v.x), sign(v.y)}; } + +Float mod(Float a, Float b) { return a - b * floor(a / b); } + +vec2 mod(vec2 a, vec2 b) { return vec2(mod(a.x, b.x), mod(a.y, b.y)); } + +vec3 abs(vec3 v) { return vec3(abs(v.x), abs(v.y), abs(v.z)); } + +vec3 sign(vec3 v) { return vec3(sign(v.x), sign(v.y), sign(v.z)); } + +mat2 inverse(mat2 v) { + Float det = v[0].x * v[1].y - v[0].y * v[1].x; + return mat2(vec2(v[1].y, -v[0].y), vec2(-v[1].x, v[0].x)) * (1. / det); +} + +mat2_scalar inverse(mat2_scalar v) { + float det = v[0].x * v[1].y - v[0].y * v[1].x; + return mat2_scalar{{v[1].y, -v[0].y}, {-v[1].x, v[0].x}} * (1. 
/ det); +} + +int32_t get_nth(I32 a, int n) { return a[n]; } + +float get_nth(Float a, int n) { return a[n]; } + +float get_nth(float a, int) { return a; } + +ivec2_scalar get_nth(ivec2 a, int n) { return ivec2_scalar{a.x[n], a.y[n]}; } + +vec2_scalar get_nth(vec2 a, int n) { return vec2_scalar{a.x[n], a.y[n]}; } + +vec3_scalar get_nth(vec3 a, int n) { + return vec3_scalar{a.x[n], a.y[n], a.z[n]}; +} + +vec4_scalar get_nth(vec4 a, int n) { + return vec4_scalar{a.x[n], a.y[n], a.z[n], a.w[n]}; +} + +ivec4_scalar get_nth(ivec4 a, int n) { + return ivec4_scalar{a.x[n], a.y[n], a.z[n], a.w[n]}; +} + +mat3_scalar get_nth(mat3 a, int n) { + return make_mat3(get_nth(a[0], n), get_nth(a[1], n), get_nth(a[2], n)); +} + +void put_nth(Float& dst, int n, float src) { dst[n] = src; } + +void put_nth(I32& dst, int n, int32_t src) { dst[n] = src; } + +void put_nth(ivec2& dst, int n, ivec2_scalar src) { + dst.x[n] = src.x; + dst.y[n] = src.y; +} + +void put_nth(vec2& dst, int n, vec2_scalar src) { + dst.x[n] = src.x; + dst.y[n] = src.y; +} + +void put_nth(vec3& dst, int n, vec3_scalar src) { + dst.x[n] = src.x; + dst.y[n] = src.y; + dst.z[n] = src.z; +} + +void put_nth(ivec4& dst, int n, ivec4_scalar src) { + dst.x[n] = src.x; + dst.y[n] = src.y; + dst.z[n] = src.z; + dst.w[n] = src.w; +} + +void put_nth(vec4& dst, int n, vec4_scalar src) { + dst.x[n] = src.x; + dst.y[n] = src.y; + dst.z[n] = src.z; + dst.w[n] = src.w; +} + +// Use an ElementType type constructor +// so that we can implement element_type for +// Int and Float +template +struct ElementType { + typedef typename V::element_type ty; +}; + +template <> +struct ElementType { + typedef float ty; +}; + +template <> +struct ElementType { + typedef float ty; +}; + +template <> +struct ElementType { + typedef float ty; +}; + +template <> +struct ElementType { + typedef int32_t ty; +}; + +void put_nth_component(ivec2_scalar& dst, int n, int32_t src) { + switch (n) { + case 0: + dst.x = src; + break; + case 1: + dst.y = src; + 
break; + } +} + +void put_nth_component(ivec4_scalar& dst, int n, int32_t src) { + switch (n) { + case 0: + dst.x = src; + break; + case 1: + dst.y = src; + break; + case 2: + dst.z = src; + break; + case 3: + dst.w = src; + break; + } +} + +void put_nth_component(int& dst, int n, int src) { + switch (n) { + case 0: + dst = src; + break; + } +} + +void put_nth_component(float& dst, int n, float src) { + switch (n) { + case 0: + dst = src; + break; + } +} + +void put_nth_component(vec2_scalar& dst, int n, float src) { + switch (n) { + case 0: + dst.x = src; + break; + case 1: + dst.y = src; + break; + } +} + +void put_nth_component(vec3_scalar& dst, int n, float src) { + switch (n) { + case 0: + dst.x = src; + break; + case 1: + dst.y = src; + break; + case 2: + dst.z = src; + break; + } +} + +void put_nth_component(vec4_scalar& dst, int n, float src) { + switch (n) { + case 0: + dst.x = src; + break; + case 1: + dst.y = src; + break; + case 2: + dst.z = src; + break; + case 3: + dst.w = src; + break; + } +} + +Float init_interp(float init0, float step) { + float init1 = init0 + step; + float init2 = init1 + step; + float init3 = init2 + step; + return {init0, init1, init2, init3}; +} + +vec2 init_interp(vec2_scalar init, vec2_scalar step) { + return vec2(init_interp(init.x, step.x), init_interp(init.y, step.y)); +} + +vec3 init_interp(vec3_scalar init, vec3_scalar step) { + return vec3(init_interp(init.x, step.x), init_interp(init.y, step.y), + init_interp(init.z, step.z)); +} + +vec4 init_interp(vec4_scalar init, vec4_scalar step) { + return vec4(init_interp(init.x, step.x), init_interp(init.y, step.y), + init_interp(init.z, step.z), init_interp(init.w, step.w)); +} + +template +struct Array { + T elements[N]; + T& operator[](size_t i) { return elements[i]; } + const T& operator[](size_t i) const { return elements[i]; } + template + void convert(const Array& s) { + for (size_t i = 0; i < N; ++i) elements[i] = T(s[i]); + } +}; + +template +Array if_then_else(I32 c, 
Array t, + Array e) { + Array r; + for (size_t i = 0; i < SIZE; i++) { + r[i] = if_then_else(c, t[i], e[i]); + } + return r; +} + +} // namespace glsl diff --git a/gfx/wr/swgl/src/lib.rs b/gfx/wr/swgl/src/lib.rs new file mode 100644 index 0000000000..e8fc030e0c --- /dev/null +++ b/gfx/wr/swgl/src/lib.rs @@ -0,0 +1,12 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#![crate_name = "swgl"] +#![crate_type = "lib"] + +extern crate gleam; + +mod swgl_fns; + +pub use crate::swgl_fns::*; diff --git a/gfx/wr/swgl/src/program.h b/gfx/wr/swgl/src/program.h new file mode 100644 index 0000000000..9ea7c6dd6e --- /dev/null +++ b/gfx/wr/swgl/src/program.h @@ -0,0 +1,186 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +struct VertexAttrib; + +namespace glsl { + +// Type holding group of scalars interpolated across rasterized rows and spans, +// shuttling values between vertex shaders and fragment shaders. +// GCC requires power-of-two vector sizes, so must use glsl type as workaround +// to operate in Float-sized chunks. +typedef vec3 Interpolants; + +// Clip distances, if enabled, are always stored in the first SIMD chunk of the +// interpolants. 
+static ALWAYS_INLINE Float get_clip_distances(const Interpolants& interp) {
+  return interp.x;
+}
+
+struct VertexShaderImpl;
+struct FragmentShaderImpl;
+
+// Abstract interface to a shader program. Apart from the empty virtual
+// destructor every member is pure virtual: lookup of uniforms and attributes
+// by name, attribute binding, the size of the interpolants, access to the
+// vertex and fragment shader objects, and the program name.
+struct ProgramImpl {
+  virtual ~ProgramImpl() {}
+  virtual int get_uniform(const char* name) const = 0;
+  virtual void bind_attrib(const char* name, int index) = 0;
+  virtual int get_attrib(const char* name) const = 0;
+  virtual size_t interpolants_size() const = 0;
+  virtual VertexShaderImpl* get_vertex_shader() = 0;
+  virtual FragmentShaderImpl* get_fragment_shader() = 0;
+  virtual const char* get_name() const = 0;
+};
+
+// Signature of a function that takes no arguments and returns a ProgramImpl*.
+typedef ProgramImpl* (*ProgramLoader)();
+
+// The maximum size of the gl_ClipDistance array.
+constexpr int32_t gl_MaxClipDistances = 4;
+
+// Vertex shader state plus a table of function pointers. Every pointer
+// defaults to nullptr and the wrapper methods below call through them without
+// a null check, so a pointer must be set before its wrapper is used.
+struct VertexShaderImpl {
+  typedef void (*SetUniform1iFunc)(VertexShaderImpl*, int index, int value);
+  typedef void (*SetUniform4fvFunc)(VertexShaderImpl*, int index,
+                                    const float* value);
+  typedef void (*SetUniformMatrix4fvFunc)(VertexShaderImpl*, int index,
+                                          const float* value);
+  typedef void (*InitBatchFunc)(VertexShaderImpl*);
+  typedef void (*LoadAttribsFunc)(VertexShaderImpl*, VertexAttrib* attribs,
+                                  uint32_t start, int instance, int count);
+  typedef void (*RunPrimitiveFunc)(VertexShaderImpl*, char* interps,
+                                   size_t interp_stride);
+
+  SetUniform1iFunc set_uniform_1i_func = nullptr;
+  SetUniform4fvFunc set_uniform_4fv_func = nullptr;
+  SetUniformMatrix4fvFunc set_uniform_matrix4fv_func = nullptr;
+  InitBatchFunc init_batch_func = nullptr;
+  LoadAttribsFunc load_attribs_func = nullptr;
+  RunPrimitiveFunc run_primitive_func = nullptr;
+
+  // Bit values stored in 'flags'.
+  enum FLAGS {
+    CLIP_DISTANCE = 1 << 0,
+  };
+  int flags = 0;
+  void enable_clip_distance() { flags |= CLIP_DISTANCE; }
+  ALWAYS_INLINE bool use_clip_distance() const {
+    return (flags & CLIP_DISTANCE) != 0;
+  }
+
+  vec4 gl_Position;
+  Float gl_ClipDistance[gl_MaxClipDistances];
+
+  // Each wrapper below forwards its arguments, with 'this' prepended, to the
+  // function pointer of the matching name.
+  void set_uniform_1i(int index, int value) {
+    (*set_uniform_1i_func)(this, index, value);
+  }
+
+  void set_uniform_4fv(int index, const float* value) {
+    (*set_uniform_4fv_func)(this, index, value);
+  }
+
+  void set_uniform_matrix4fv(int index, const float* value) {
+    (*set_uniform_matrix4fv_func)(this, index, value);
+  }
+
+  void init_batch() { (*init_batch_func)(this); }
+
+  ALWAYS_INLINE void load_attribs(VertexAttrib* attribs, uint32_t start,
+                                  int instance, int count) {
+    (*load_attribs_func)(this, attribs, start, instance, count);
+  }
+
+  ALWAYS_INLINE void run_primitive(char* interps, size_t interp_stride) {
+    (*run_primitive_func)(this, interps, interp_stride);
+  }
+};
+
+// The number of pixels in a step.
+constexpr int32_t swgl_StepSize = 4;
+
+// Fragment shader state plus a table of function pointers, all defaulting to
+// nullptr. init_span/run/skip each exist in a plain and a "W" variant; the
+// draw_span pointers exist per output pointer type (uint32_t* and uint8_t*)
+// and can be probed with has_draw_span before being called.
+struct FragmentShaderImpl {
+  typedef void (*InitSpanFunc)(FragmentShaderImpl*, const void* interps,
+                               const void* step);
+  typedef void (*RunFunc)(FragmentShaderImpl*);
+  typedef void (*SkipFunc)(FragmentShaderImpl*, int steps);
+  typedef void (*InitSpanWFunc)(FragmentShaderImpl*, const void* interps,
+                                const void* step);
+  typedef void (*RunWFunc)(FragmentShaderImpl*);
+  typedef void (*SkipWFunc)(FragmentShaderImpl*, int steps);
+  typedef int (*DrawSpanRGBA8Func)(FragmentShaderImpl*);
+  typedef int (*DrawSpanR8Func)(FragmentShaderImpl*);
+
+  InitSpanFunc init_span_func = nullptr;
+  RunFunc run_func = nullptr;
+  SkipFunc skip_func = nullptr;
+  InitSpanWFunc init_span_w_func = nullptr;
+  RunWFunc run_w_func = nullptr;
+  SkipWFunc skip_w_func = nullptr;
+  DrawSpanRGBA8Func draw_span_RGBA8_func = nullptr;
+  DrawSpanR8Func draw_span_R8_func = nullptr;
+
+  // Bit values stored in 'flags'.
+  enum FLAGS {
+    DISCARD = 1 << 0,
+    PERSPECTIVE = 1 << 1,
+  };
+  int flags = 0;
+  void enable_discard() { flags |= DISCARD; }
+  void enable_perspective() { flags |= PERSPECTIVE; }
+  ALWAYS_INLINE bool use_discard() const { return (flags & DISCARD) != 0; }
+  ALWAYS_INLINE bool use_perspective() const {
+    return (flags & PERSPECTIVE) != 0;
+  }
+
+  vec4 gl_FragCoord;
+  vec4 gl_FragColor;
+  vec4 gl_SecondaryFragColor;
+
+  vec2_scalar swgl_StepZW;
+  Bool swgl_IsPixelDiscarded = false;
+  // The current buffer position for committing span output.
+  uint32_t* swgl_OutRGBA8 = nullptr;
+  uint8_t* swgl_OutR8 = nullptr;
+  // The remaining number of pixels in the span.
+  int32_t swgl_SpanLength = 0;
+
+  // Advance gl_FragCoord.x by the given number of steps.
+  ALWAYS_INLINE void step_fragcoord(int steps = 4) { gl_FragCoord.x += steps; }
+
+  // Advance gl_FragCoord.z and gl_FragCoord.w by swgl_StepZW.x and
+  // swgl_StepZW.y respectively, once per step.
+  ALWAYS_INLINE void step_perspective(int steps = 4) {
+    gl_FragCoord.z += swgl_StepZW.x * steps;
+    gl_FragCoord.w += swgl_StepZW.y * steps;
+  }
+
+  // The three helpers below pick the "W" or plain function pointer depending
+  // on W and call it with 'this' prepended to the arguments.
+  // NOTE(review): W is used but not declared in this text; the parameter list
+  // after each 'template' keyword appears to be missing - confirm upstream.
+  template
+  ALWAYS_INLINE void init_span(const void* interps, const void* step) {
+    (*(W ? init_span_w_func : init_span_func))(this, interps, step);
+  }
+
+  template
+  ALWAYS_INLINE void run() {
+    (*(W ? run_w_func : run_func))(this);
+  }
+
+  template
+  ALWAYS_INLINE void skip(int steps = 4) {
+    (*(W ? skip_w_func : skip_func))(this, steps);
+  }
+
+  // Record the RGBA8 output pointer and span length, then call the RGBA8 span
+  // function and return its result. The function pointer is not null-checked
+  // here; see has_draw_span.
+  ALWAYS_INLINE int draw_span(uint32_t* buf, int len) {
+    swgl_OutRGBA8 = buf;
+    swgl_SpanLength = len;
+    return (*draw_span_RGBA8_func)(this);
+  }
+
+  // True if an RGBA8 span function is set. The unnamed argument only selects
+  // the overload.
+  ALWAYS_INLINE bool has_draw_span(uint32_t*) {
+    return draw_span_RGBA8_func != nullptr;
+  }
+
+  // Record the R8 output pointer and span length, then call the R8 span
+  // function and return its result. The function pointer is not null-checked
+  // here; see has_draw_span.
+  ALWAYS_INLINE int draw_span(uint8_t* buf, int len) {
+    swgl_OutR8 = buf;
+    swgl_SpanLength = len;
+    return (*draw_span_R8_func)(this);
+  }
+
+  // True if an R8 span function is set. The unnamed argument only selects the
+  // overload.
+  ALWAYS_INLINE bool has_draw_span(uint8_t*) {
+    return draw_span_R8_func != nullptr;
+  }
+};
+
+} // namespace glsl
diff --git a/gfx/wr/swgl/src/rasterize.h b/gfx/wr/swgl/src/rasterize.h
new file mode 100644
index 0000000000..a4419c6555
--- /dev/null
+++ b/gfx/wr/swgl/src/rasterize.h
@@ -0,0 +1,1680 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// The SWGL depth buffer is roughly organized as a span buffer where each row
+// of the depth buffer is a list of spans, and each span has a constant depth
+// and a run length (represented by DepthRun).
The span from start..start+count +// is placed directly at that start index in the row's array of runs, so that +// there is no need to explicitly record the start index at all. This also +// avoids the need to move items around in the run array to manage insertions +// since space is implicitly always available for a run between any two +// pre-existing runs. Linkage from one run to the next is implicitly defined by +// the count, so if a run exists from start..start+count, the next run will +// implicitly pick up right at index start+count where that preceding run left +// off. All of the DepthRun items that are after the head of the run can remain +// uninitialized until the run needs to be split and a new run needs to start +// somewhere in between. +// For uses like perspective-correct rasterization or with a discard mask, a +// run is not an efficient representation, and it is more beneficial to have +// a flattened array of individual depth samples that can be masked off easily. +// To support this case, the first run in a given row's run array may have a +// zero count, signaling that this entire row is flattened. Critically, the +// depth and count fields in DepthRun are ordered (endian-dependently) so that +// the DepthRun struct can be interpreted as a sign-extended int32_t depth. It +// is then possible to just treat the entire row as an array of int32_t depth +// samples that can be processed with SIMD comparisons, since the count field +// behaves as just the sign-extension of the depth field. The count field is +// limited to 8 bits so that we can support depth values up to 24 bits. +// When a depth buffer is cleared, each row is initialized to maximal runs +// spanning the entire row. In the normal case, the depth buffer will continue +// to manage itself as a list of runs. 
If perspective or discard is used for
+// a given row, the row will be converted to the flattened representation to
+// support it, after which it will only ever revert back to runs if the depth
+// buffer is cleared.
+
+// The largest 24-bit depth value supported.
+constexpr uint32_t MAX_DEPTH_VALUE = 0xFFFFFF;
+// The longest 8-bit depth run that is supported, aligned to SIMD chunk size.
+// (255 & ~3 evaluates to 252, i.e. 255 rounded down to a multiple of 4.)
+constexpr uint32_t MAX_DEPTH_RUN = 255 & ~3;
+
+struct DepthRun {
+  // Ensure that depth always occupies the LSB and count the MSB so that we
+  // can sign-extend depth just by setting count to zero, marking it flat.
+  // When count is non-zero, then this is interpreted as an actual run and
+  // depth is read in isolation.
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  uint32_t depth : 24;
+  uint32_t count : 8;
+#else
+  uint32_t count : 8;
+  uint32_t depth : 24;
+#endif
+
+  DepthRun() = default;
+  DepthRun(uint32_t depth, uint8_t count) : depth(depth), count(count) {}
+
+  // If count is zero, this is actually a flat depth sample rather than a run.
+  bool is_flat() const { return !count; }
+
+  // Compare a source depth from rasterization with a stored depth value.
+  // Returns whether src passes against the stored depth for GL_LEQUAL
+  // (src <= depth), GL_LESS (src < depth) or GL_ALWAYS (always true). Any
+  // other FUNC asserts and returns false.
+  // NOTE(review): FUNC is used but not declared in this text; the parameter
+  // list after 'template' appears to be missing - confirm upstream.
+  template
+  ALWAYS_INLINE bool compare(uint32_t src) const {
+    switch (FUNC) {
+      case GL_LEQUAL:
+        return src <= depth;
+      case GL_LESS:
+        return src < depth;
+      case GL_ALWAYS:
+        return true;
+      default:
+        assert(false);
+        return false;
+    }
+  }
+};
+
+// Fills runs at the given position with the given depth up to the span width.
+// Only the head entry of each emitted run is written; the entries between
+// heads are left untouched.
+static ALWAYS_INLINE void set_depth_runs(DepthRun* runs, uint32_t depth,
+                                         uint32_t width) {
+  // If the width exceeds the maximum run size, then we need to output clamped
+  // runs first.
+  for (; width >= MAX_DEPTH_RUN;
+       runs += MAX_DEPTH_RUN, width -= MAX_DEPTH_RUN) {
+    *runs = DepthRun(depth, MAX_DEPTH_RUN);
+  }
+  // If there are still any left over samples to fill under the maximum run
+  // size, then output one last run for them.
+  if (width > 0) {
+    *runs = DepthRun(depth, width);
+  }
+}
+
+// A cursor for reading and modifying a row's depth run array. It locates
+// and iterates through a desired span within all the runs, testing if
+// the depth of this span passes or fails the depth test against existing
+// runs. If desired, new runs may be inserted to represent depth occlusion
+// from this span in the run array.
+struct DepthCursor {
+  // Current position of run the cursor has advanced to.
+  DepthRun* cur = nullptr;
+  // The start of the remaining potential samples in the desired span.
+  DepthRun* start = nullptr;
+  // The end of the potential samples in the desired span.
+  DepthRun* end = nullptr;
+
+  DepthCursor() = default;
+
+  // Construct a cursor with runs for a given row's run array and the bounds
+  // of the span we wish to iterate within it.
+  DepthCursor(DepthRun* runs, int num_runs, int span_offset, int span_count)
+      : cur(runs), start(&runs[span_offset]), end(start + span_count) {
+    // This cursor should never iterate over flat runs
+    assert(!runs->is_flat());
+    DepthRun* end_runs = &runs[num_runs];
+    // Clamp end of span to end of row
+    if (end > end_runs) {
+      end = end_runs;
+    }
+    // If the span starts past the end of the row, just advance immediately
+    // to it to signal that we're done.
+    if (start >= end_runs) {
+      cur = end_runs;
+      start = end_runs;
+      return;
+    }
+    // Otherwise, find the first depth run that contains the start of the span.
+    // If the span starts after the given run, then we need to keep searching
+    // through the row to find an appropriate run. The check above already
+    // guaranteed that the span starts within the row's runs, and the search
+    // won't fall off the end.
+    for (;;) {
+      assert(cur < end);
+      DepthRun* next = cur + cur->count;
+      if (start < next) {
+        break;
+      }
+      cur = next;
+    }
+  }
+
+  // The cursor is valid if the current position is at the end or if the run
+  // contains the start position.
+  bool valid() const {
+    return cur >= end || (cur <= start && start < cur + cur->count);
+  }
+
+  // Skip past any initial runs that fail the depth test. If we find a run that
+  // would pass, then return the accumulated length between where we started
+  // and that position. Otherwise, if we fall off the end, return -1 to signal
+  // that there are no more passed runs at the end of this failed region and
+  // so it is safe for the caller to stop processing any more regions in this
+  // row.
+  // NOTE(review): the parameter list after 'template' and the template
+  // argument on compare() appear to be missing from this text.
+  template
+  int skip_failed(uint32_t val) {
+    assert(valid());
+    DepthRun* prev = start;
+    while (cur < end) {
+      if (cur->compare(val)) {
+        return start - prev;
+      }
+      // The run failed: move both the cursor and the span start past it.
+      cur += cur->count;
+      start = cur;
+    }
+    return -1;
+  }
+
+  // Helper to convert function parameters into template parameters to hoist
+  // some checks out of inner loops.
+  // NOTE(review): both cases read identically here; the per-case template
+  // arguments appear to have been lost from this text.
+  ALWAYS_INLINE int skip_failed(uint32_t val, GLenum func) {
+    switch (func) {
+      case GL_LEQUAL:
+        return skip_failed(val);
+      case GL_LESS:
+        return skip_failed(val);
+      default:
+        assert(false);
+        return -1;
+    }
+  }
+
+  // Find a region of runs that passes the depth test. It is assumed the caller
+  // has called skip_failed first to skip past any runs that failed the depth
+  // test. This stops when it finds a run that fails the depth test or we fall
+  // off the end of the row. If the write mask is enabled, this will insert runs
+  // to represent this new region that passed the depth test. The length of the
+  // region is returned.
+  // NOTE(review): MASK is used but not declared in this text; the parameter
+  // list after 'template' appears to be missing - confirm upstream.
+  template
+  int check_passed(uint32_t val) {
+    assert(valid());
+    DepthRun* prev = cur;
+    while (cur < end) {
+      if (!cur->compare(val)) {
+        break;
+      }
+      DepthRun* next = cur + cur->count;
+      if (next > end) {
+        if (MASK) {
+          // Chop the current run where the end of the span falls, making a new
+          // run from the end of the span till the next run. The beginning of
+          // the current run will be folded into the run from the start of the
+          // passed region before returning below.
+          *end = DepthRun(cur->depth, next - end);
+        }
+        // If the next run starts past the end, then just advance the current
+        // run to the end to signal that we're now at the end of the row.
+        next = end;
+      }
+      cur = next;
+    }
+    // If we haven't advanced past the start of the span region, then we found
+    // nothing that passed.
+    if (cur <= start) {
+      return 0;
+    }
+    // If 'end' fell within the middle of a passing run, then 'cur' will end up
+    // pointing at the new partial run created at 'end' where the passing run
+    // was split to accommodate starting in the middle. The preceding runs will
+    // be fixed below to properly join with this new split.
+    int passed = cur - start;
+    if (MASK) {
+      // If the search started from a run before the start of the span, then
+      // edit that run to meet up with the start.
+      if (prev < start) {
+        prev->count = start - prev;
+      }
+      // Create a new run for the entirety of the passed samples.
+      set_depth_runs(start, val, passed);
+    }
+    start = cur;
+    return passed;
+  }
+
+  // Helper to convert function parameters into template parameters to hoist
+  // some checks out of inner loops.
+  // NOTE(review): both cases read identically here; the per-case template
+  // arguments appear to have been lost from this text.
+  template
+  ALWAYS_INLINE int check_passed(uint32_t val, GLenum func) {
+    switch (func) {
+      case GL_LEQUAL:
+        return check_passed(val);
+      case GL_LESS:
+        return check_passed(val);
+      default:
+        assert(false);
+        return 0;
+    }
+  }
+
+  // Runtime dispatch on the write mask.
+  // NOTE(review): both arms of the conditional read identically here;
+  // presumably they differed in a template argument - confirm upstream.
+  ALWAYS_INLINE int check_passed(uint32_t val, GLenum func, bool mask) {
+    return mask ? check_passed(val, func)
+                : check_passed(val, func);
+  }
+
+  // Fill a region of runs with a given depth value, bypassing any depth test.
+  // NOTE(review): the template arguments that would select the bypassing
+  // variant of check_passed are not visible in this text.
+  ALWAYS_INLINE void fill(uint32_t depth) {
+    check_passed(depth);
+  }
+};
+
+// Initialize a depth texture by setting the first run in each row to encompass
+// the entire row.
+void Texture::init_depth_runs(uint32_t depth) {
+  // Nothing to initialize when there is no backing buffer.
+  if (!buf) return;
+  DepthRun* runs = (DepthRun*)buf;
+  for (int y = 0; y < height; y++) {
+    set_depth_runs(runs, depth, width);
+    // Advance to the next row. stride() is divided by sizeof(DepthRun), so it
+    // is presumably a byte count - confirm against Texture::stride.
+    runs += stride() / sizeof(DepthRun);
+  }
+  set_cleared(true);
+}
+
+// Fill a portion of the run array with flattened depth samples.
+// Treats dst as an array of uint32_t and hands n and depth to fill_n.
+static ALWAYS_INLINE void fill_flat_depth(DepthRun* dst, size_t n,
+                                          uint32_t depth) {
+  fill_n((uint32_t*)dst, n, depth);
+}
+
+// Fills a scissored region of a depth texture with a given depth.
+// Does nothing without a backing buffer, and asserts that the texture is
+// already marked cleared.
+void Texture::fill_depth_runs(uint32_t depth, const IntRect& scissor) {
+  if (!buf) return;
+  assert(cleared());
+  // The scissor is shifted by the texture offset and intersected with the
+  // texture bounds before use.
+  IntRect bb = bounds().intersection(scissor - offset);
+  DepthRun* runs = (DepthRun*)sample_ptr(0, bb.y0);
+  for (int rows = bb.height(); rows > 0; rows--) {
+    if (bb.width() >= width) {
+      // If the scissor region encompasses the entire row, reset the row to a
+      // single run encompassing the entire row.
+      set_depth_runs(runs, depth, width);
+    } else if (runs->is_flat()) {
+      // If the row is flattened, just directly fill the portion of the row.
+      fill_flat_depth(&runs[bb.x0], bb.width(), depth);
+    } else {
+      // Otherwise, if we are still using runs, then set up a cursor to fill
+      // it with depth runs.
+      DepthCursor(runs, width, bb.x0, bb.width()).fill(depth);
+    }
+    runs += stride() / sizeof(DepthRun);
+  }
+}
+
+using ZMask = I32;
+
+// zmask_code reduces a ZMask to a scalar that callers compare against
+// ZMASK_NONE_PASSED and ZMASK_ALL_PASSED. The two build variants produce
+// different encodings, which is why ZMASK_NONE_PASSED differs between them.
+// NOTE(review): the bit_cast in the non-SSE2 variant has no visible target
+// type; a template argument appears to be missing from this text.
+#if USE_SSE2
+# define ZMASK_NONE_PASSED 0xFFFF
+# define ZMASK_ALL_PASSED 0
+static inline uint32_t zmask_code(ZMask mask) {
+  return _mm_movemask_epi8(mask);
+}
+#else
+# define ZMASK_NONE_PASSED 0xFFFFFFFFU
+# define ZMASK_ALL_PASSED 0
+static inline uint32_t zmask_code(ZMask mask) {
+  return bit_cast(CONVERT(mask, U8));
+}
+#endif
+
+// Interprets items in the depth buffer as sign-extended 32-bit depth values
+// instead of as runs. Returns a mask that signals which samples in the given
+// chunk passed or failed the depth test with given Z value.
+// Returns false, leaving outmask untouched, when no lane passed. Otherwise
+// returns true and sets outmask to the mask of lanes that failed the test or
+// lie beyond 'span'. When DISCARD is false and depth writes are enabled, the
+// passing lanes of zbuf are overwritten with src.
+// NOTE(review): DISCARD is used but not declared in this text, and
+// unaligned_load has no visible template argument; the '<...>' lists appear to
+// be missing - confirm upstream.
+template
+static ALWAYS_INLINE bool check_depth(I32 src, DepthRun* zbuf, ZMask& outmask,
+                                      int span = 4) {
+  // SSE2 does not support unsigned comparison. So ensure Z value is
+  // sign-extended to int32_t.
+  I32 dest = unaligned_load(zbuf);
+  // Invert the depth test to check which pixels failed and should be discarded.
+  ZMask mask = ctx->depthfunc == GL_LEQUAL
+                   ?
+                   // GL_LEQUAL: Not(LessEqual) = Greater
+                   ZMask(src > dest)
+                   :
+                   // GL_LESS: Not(Less) = GreaterEqual
+                   ZMask(src >= dest);
+  // Mask off any unused lanes in the span.
+  mask |= ZMask(span) < ZMask{1, 2, 3, 4};
+  if (zmask_code(mask) == ZMASK_NONE_PASSED) {
+    return false;
+  }
+  if (!DISCARD && ctx->depthmask) {
+    // Keep the stored depth in failed lanes and take src in passing lanes.
+    unaligned_store(zbuf, (mask & dest) | (~mask & src));
+  }
+  outmask = mask;
+  return true;
+}
+
+// Scales the fragment shader's gl_FragCoord.z by MAX_DEPTH_VALUE and converts
+// the result to I32.
+// NOTE(review): 'cast' has no visible template argument in this text.
+static ALWAYS_INLINE I32 packDepth() {
+  return cast(fragment_shader->gl_FragCoord.z * MAX_DEPTH_VALUE);
+}
+
+// When depth writes are enabled, stores src into zbuf for every lane that is
+// set neither in 'mask' nor in the shader's swgl_IsPixelDiscarded; other lanes
+// keep their stored depth.
+static ALWAYS_INLINE void discard_depth(I32 src, DepthRun* zbuf, I32 mask) {
+  if (ctx->depthmask) {
+    I32 dest = unaligned_load(zbuf);
+    mask |= fragment_shader->swgl_IsPixelDiscarded;
+    unaligned_store(zbuf, (mask & dest) | (~mask & src));
+  }
+}
+
+// Packs the fragment color, blends it with the destination when a blend key
+// is active, then stores the result. Lanes set in zmask keep the destination
+// pixel instead.
+static ALWAYS_INLINE void mask_output(uint32_t* buf, ZMask zmask,
+                                      int span = 4) {
+  WideRGBA8 r = pack_pixels_RGBA8();
+  PackedRGBA8 dst = load_span(buf, span);
+  if (blend_key) r = blend_pixels(buf, dst, r, span);
+  PackedRGBA8 mask = bit_cast(zmask);
+  store_span(buf, (mask & dst) | (~mask & pack(r)), span);
+}
+
+// Primary template: outputs with the shader's swgl_IsPixelDiscarded as the
+// mask.
+// NOTE(review): the template parameter list is not visible in this text.
+template
+static ALWAYS_INLINE void discard_output(uint32_t* buf, int span = 4) {
+  mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span);
+}
+
+// Specialization: packs, optionally blends, and stores with no mask applied.
+// NOTE(review): the specialization's template argument is not visible in this
+// text.
+template <>
+ALWAYS_INLINE void discard_output(uint32_t* buf, int span) {
+  WideRGBA8 r = pack_pixels_RGBA8();
+  if (blend_key)
+    r = blend_pixels(buf, load_span(buf, span), r, span);
+  store_span(buf, pack(r), span);
+}
+
+// R8 counterpart of mask_output above: the destination is unpacked to WideR8,
+// the mask is narrowed with packR8, and the masked result is packed on store.
+static ALWAYS_INLINE void mask_output(uint8_t* buf, ZMask zmask, int span = 4) {
+  WideR8 r = pack_pixels_R8();
+  WideR8 dst = unpack(load_span(buf, span));
+  if (blend_key) r = blend_pixels(buf, dst, r, span);
+  WideR8 mask = packR8(zmask);
+  store_span(buf, pack((mask & dst) | (~mask & r)), span);
+}
+
+// R8 primary template: outputs with the shader's swgl_IsPixelDiscarded as the
+// mask.
+// NOTE(review): the template parameter list is not visible in this text.
+template
+static ALWAYS_INLINE void discard_output(uint8_t* buf, int span = 4) {
+  mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span);
+}
+
+// R8 specialization: packs, optionally blends, and stores with no mask
+// applied.
+template <>
+ALWAYS_INLINE void discard_output(uint8_t* buf, int span) {
+  WideR8 r = pack_pixels_R8();
+  if (blend_key)
+    r = blend_pixels(buf, unpack(load_span(buf, span)), r, span);
+  store_span(buf, pack(r), span);
+}
+
+// A clip rectangle with float edges x0, y0, x1, y1.
+struct ClipRect {
+  float x0;
+  float y0;
+  float x1;
+  float y1;
+
+  explicit ClipRect(const IntRect& i)
+      : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {}
+  // Builds the rectangle from ctx->apply_scissor(t). Constructing it also
+  // writes shared state outside this object: blend_key, swgl_ClipFlags and,
+  // when a clip mask is in use, swgl_ClipMaskBounds and swgl_ClipMaskOffset.
+  explicit ClipRect(const Texture& t) : ClipRect(ctx->apply_scissor(t)) {
+    // If blending is enabled, set blend_key to reflect the resolved blend
+    // state for the currently drawn primitive.
+    if (ctx->blend) {
+      blend_key = ctx->blend_key;
+      if (swgl_ClipFlags) {
+        // If there is a blend override set, replace the blend key with it.
+        if (swgl_ClipFlags & SWGL_CLIP_FLAG_BLEND_OVERRIDE) {
+          blend_key = swgl_BlendOverride;
+        }
+        // If a clip mask is available, set up blending state to use the clip
+        // mask.
+        if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {
+          assert(swgl_ClipMask->format == TextureFormat::R8);
+          // Constrain the clip mask bounds to always fall within the clip mask.
+          swgl_ClipMaskBounds.intersect(IntRect{0, 0, int(swgl_ClipMask->width),
+                                                int(swgl_ClipMask->height)});
+          // The clip mask offset is relative to the viewport.
+          swgl_ClipMaskOffset += ctx->viewport.origin() - t.offset;
+          // The clip mask bounds are relative to the clip mask offset.
+          swgl_ClipMaskBounds.offset(swgl_ClipMaskOffset);
+          // Finally, constrain the clip rectangle by the clip mask bounds.
+          intersect(swgl_ClipMaskBounds);
+          // Modify the blend key so that it will use the clip mask while
+          // blending.
+          restore_clip_mask();
+        }
+        if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) {
+          // Modify the blend key so that it will use AA while blending.
+          restore_aa();
+        }
+      }
+    } else {
+      // Without blending, clear both the blend key and the clip flags.
+      blend_key = BLEND_KEY_NONE;
+      swgl_ClipFlags = 0;
+    }
+  }
+
+  // The horizontal extent {x0, x1} of the rectangle.
+  FloatRange x_range() const { return {x0, x1}; }
+
+  // Shrink this rectangle to its overlap with c.
+  void intersect(const IntRect& c) {
+    x0 = max(x0, float(c.x0));
+    y0 = max(y0, float(c.y0));
+    x1 = min(x1, float(c.x1));
+    y1 = min(y1, float(c.y1));
+  }
+
+  // When the clip mask flag is set, record buf as the span buffer and point
+  // swgl_ClipMaskBuf at the clip mask byte for (x, y), after subtracting the
+  // clip mask offset.
+  // NOTE(review): P is used but the template parameter list is not visible in
+  // this text.
+  template
+  void set_clip_mask(int x, int y, P* buf) const {
+    if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {
+      swgl_SpanBuf = buf;
+      swgl_ClipMaskBuf = (uint8_t*)swgl_ClipMask->buf +
+                         (y - swgl_ClipMaskOffset.y) * swgl_ClipMask->stride +
+                         (x - swgl_ClipMaskOffset.x);
+    }
+  }
+
+  // NOTE(review): P is used but the template parameter list is not visible in
+  // this text.
+  template
+  bool overlaps(int nump, const P* p) const {
+    // Generate a mask of which side of the clip rect all of a polygon's points
+    // fall inside of. This is a cheap conservative estimate of whether the
+    // bounding box of the polygon might overlap the clip rect, rather than an
+    // exact test that would require multiple slower line intersections.
+    int sides = 0;
+    for (int i = 0; i < nump; i++) {
+      // Bit 1: some point has x < x1. Bit 2: some point has x > x0 or
+      // x >= x1. Bits 4 and 8 are the same for y against y1 and y0.
+      sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2;
+      sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8;
+    }
+    // All four bits must be set for a possible overlap.
+    return sides == 0xF;
+  }
+};
+
+// Given a current X position at the center Y position of a row, return the X
+// position of the left and right intercepts of the row top and bottom.
+// The half-width used is 0.5 * |x_slope|, i.e. the X travel over half a row.
+// NOTE(review): E is used but the template parameter list is not visible in
+// this text.
+template
+static ALWAYS_INLINE FloatRange x_intercepts(const E& e) {
+  float rad = 0.5f * abs(e.x_slope());
+  return {e.cur_x() - rad, e.cur_x() + rad};
+}
+
+// Return the AA sub-span corresponding to a given edge. If AA is requested,
+// then this finds the X intercepts with the row clipped into range of the
+// edge and finally conservatively rounds them out. If there is no AA, then
+// it just returns the current rounded X position clipped within bounds.
+template +static ALWAYS_INLINE IntRange aa_edge(const E& e, const FloatRange& bounds) { + return e.edgeMask ? bounds.clip(x_intercepts(e)).round_out() + : bounds.clip({e.cur_x(), e.cur_x()}).round(); +} + +// Calculate the initial AA coverage as an approximation of the distance from +// the center of the pixel in the direction of the edge slope. Given an edge +// (x,y)..(x+dx,y+dy), then the normalized tangent vector along the edge is +// (dx,dy)/sqrt(dx^2+dy^2). We know that for dy=1 then dx=e.x_slope. We rotate +// the tangent vector either -90 or +90 degrees to get the edge normal vector, +// where 'dx=-dy and 'dy=dx. Once normalized by 1/sqrt(dx^2+dy^2), scale into +// the range of 0..256 so that we can cheaply convert to a fixed-point scale +// factor. It is assumed that at exactly the pixel center the opacity is half +// (128) and linearly decreases along the normal vector at 1:1 scale with the +// slope. While not entirely accurate, this gives a reasonably agreeable looking +// approximation of AA. For edges on which there is no AA, just force the +// opacity to maximum (256) with no slope, relying on the span clipping to trim +// pixels outside the span. +template +static ALWAYS_INLINE FloatRange aa_dist(const E& e, float dir) { + if (e.edgeMask) { + float dx = (dir * 256.0f) * inversesqrt(1.0f + e.x_slope() * e.x_slope()); + return {128.0f + dx * (e.cur_x() - 0.5f), -dx}; + } else { + return {256.0f, 0.0f}; + } +} + +template +static ALWAYS_INLINE IntRange aa_span(P* buf, const E& left, const E& right, + const FloatRange& bounds) { + // If there is no AA, just return the span from the rounded left edge X + // position to the rounded right edge X position. Clip the span to be within + // the valid bounds. + if (!(swgl_ClipFlags & SWGL_CLIP_FLAG_AA)) { + return bounds.clip({left.cur_x(), right.cur_x()}).round(); + } + + // Calculate the left and right AA spans along with the coverage distances + // and slopes necessary to do blending. 
+ IntRange leftAA = aa_edge(left, bounds); + FloatRange leftDist = aa_dist(left, -1.0f); + IntRange rightAA = aa_edge(right, bounds); + FloatRange rightDist = aa_dist(right, 1.0f); + + // Use the pointer into the destination buffer as a status indicator of the + // coverage offset. The pointer is calculated so that subtracting it with + // the current destination pointer will yield a negative value if the span + // is outside the opaque area and otherwise will yield a positive value + // above the opaque size. This pointer is stored as a uint8 pointer so that + // there are no hidden multiplication instructions and will just return a + // 1:1 linear memory address. Thus the size of the opaque region must also + // be scaled by the pixel size in bytes. + swgl_OpaqueStart = (const uint8_t*)(buf + leftAA.end); + swgl_OpaqueSize = max(rightAA.start - leftAA.end - 3, 0) * sizeof(P); + + // Offset the coverage distances by the end of the left AA span, which + // corresponds to the opaque start pointer, so that pixels become opaque + // immediately after. The distances are also offset for each lane in the + // chunk. + Float offset = cast(leftAA.end + (I32){0, 1, 2, 3}); + swgl_LeftAADist = leftDist.start + offset * leftDist.end; + swgl_RightAADist = rightDist.start + offset * rightDist.end; + swgl_AASlope = + (Float){leftDist.end, rightDist.end, 0.0f, 0.0f} / float(sizeof(P)); + + // Return the full span width from the start of the left span to the end of + // the right span. + return {leftAA.start, rightAA.end}; +} + +// Calculate the span the user clip distances occupy from the left and right +// edges at the current row. +template +static ALWAYS_INLINE IntRange clip_distance_range(const E& left, + const E& right) { + Float leftClip = get_clip_distances(left.interp); + Float rightClip = get_clip_distances(right.interp); + // Get the change in clip dist per X step. 
+ Float clipStep = (rightClip - leftClip) / (right.cur_x() - left.cur_x()); + // Find the zero intercepts starting from the left edge. + Float clipDist = + clamp(left.cur_x() - leftClip * recip(clipStep), 0.0f, 1.0e6f); + // Find the distance to the start of the span for any clip distances that + // are increasing in value. If the clip distance is constant or decreasing + // in value, then check if it starts outside the clip volume. + Float start = if_then_else(clipStep > 0.0f, clipDist, + if_then_else(leftClip < 0.0f, 1.0e6f, 0.0f)); + // Find the distance to the end of the span for any clip distances that are + // decreasing in value. If the clip distance is constant or increasing in + // value, then check if it ends inside the clip volume. + Float end = if_then_else(clipStep < 0.0f, clipDist, + if_then_else(rightClip >= 0.0f, 1.0e6f, 0.0f)); + // Find the furthest start offset. + start = max(start, start.zwxy); + // Find the closest end offset. + end = min(end, end.zwxy); + // Finally, round the offsets to an integer span that can be used to bound + // the current span. + return FloatRange{max(start.x, start.y), min(end.x, end.y)}.round(); +} + +// Converts a run array into a flattened array of depth samples. This just +// walks through every run and fills the samples with the depth value from +// the run. +static void flatten_depth_runs(DepthRun* runs, size_t width) { + if (runs->is_flat()) { + return; + } + while (width > 0) { + size_t n = runs->count; + fill_flat_depth(runs, n, runs->depth); + runs += n; + width -= n; + } +} + +// Helper function for drawing passed depth runs within the depth buffer. +// Flattened depth (perspective or discard) is not supported. +template +static ALWAYS_INLINE void draw_depth_span(uint32_t z, P* buf, + DepthCursor& cursor) { + for (;;) { + // Get the span that passes the depth test. Assume on entry that + // any failed runs have already been skipped. 
+ int span = cursor.check_passed(z, ctx->depthfunc, ctx->depthmask); + // If nothing passed, since we already skipped past failed runs + // previously, we must have hit the end of the row. Bail out. + if (span <= 0) { + break; + } + if (span >= 4) { + // If we have a draw specialization, try to process as many 4-pixel + // chunks as possible using it. + if (fragment_shader->has_draw_span(buf)) { + int drawn = fragment_shader->draw_span(buf, span & ~3); + buf += drawn; + span -= drawn; + } + // Otherwise, just process each chunk individually. + while (span >= 4) { + fragment_shader->run(); + discard_output(buf); + buf += 4; + span -= 4; + } + } + // If we have a partial chunk left over, we still have to process it as if + // it were a full chunk. Mask off only the part of the chunk we want to + // use. + if (span > 0) { + fragment_shader->run(); + discard_output(buf, span); + buf += span; + } + // Skip past any runs that fail the depth test. + int skip = cursor.skip_failed(z, ctx->depthfunc); + // If there aren't any, that means we won't encounter any more passing runs + // and so it's safe to bail out. + if (skip <= 0) { + break; + } + // Advance interpolants for the fragment shader past the skipped region. + // If we processed a partial chunk above, we actually advanced the + // interpolants a full chunk in the fragment shader's run function. Thus, + // we need to first subtract off that 4-pixel chunk and only partially + // advance them to that partial chunk before we can add on the rest of the + // skips. This is combined with the skip here for efficiency's sake. + fragment_shader->skip(skip - (span > 0 ? 4 - span : 0)); + buf += skip; + } +} + +// Draw a simple span in 4-pixel wide chunks, optionally using depth. +template +static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) { + if (depth) { + // Depth testing is enabled. 
If perspective is used, Z values will vary + // across the span, we use packDepth to generate packed Z values suitable + // for depth testing based on current values from gl_FragCoord.z. + // Otherwise, for the no-perspective case, we just use the provided Z. + // Process 4-pixel chunks first. + for (; span >= 4; span -= 4, buf += 4, depth += 4) { + I32 zsrc = z(); + ZMask zmask; + if (check_depth(zsrc, depth, zmask)) { + fragment_shader->run(); + mask_output(buf, zmask); + if (DISCARD) discard_depth(zsrc, depth, zmask); + } else { + fragment_shader->skip(); + } + } + // If there are any remaining pixels, do a partial chunk. + if (span > 0) { + I32 zsrc = z(); + ZMask zmask; + if (check_depth(zsrc, depth, zmask, span)) { + fragment_shader->run(); + mask_output(buf, zmask, span); + if (DISCARD) discard_depth(zsrc, depth, zmask); + } + } + } else { + // Process 4-pixel chunks first. + for (; span >= 4; span -= 4, buf += 4) { + fragment_shader->run(); + discard_output(buf); + } + // If there are any remaining pixels, do a partial chunk. + if (span > 0) { + fragment_shader->run(); + discard_output(buf, span); + } + } +} + +// Called during rasterization to forcefully clear a row on which delayed clear +// has been enabled. If we know that we are going to completely overwrite a part +// of the row, then we only need to clear the row outside of that part. However, +// if blending or discard is enabled, the values of that underlying part of the +// row may be used regardless to produce the final rasterization result, so we +// have to then clear the entire underlying row to prepare it. +template +static inline void prepare_row(Texture& colortex, int y, int startx, int endx, + bool use_discard, DepthRun* depth, + uint32_t z = 0, DepthCursor* cursor = nullptr) { + assert(colortex.delay_clear > 0); + // Delayed clear is enabled for the color buffer. Check if needs clear. 
+ uint32_t& mask = colortex.cleared_rows[y / 32]; + if ((mask & (1 << (y & 31))) == 0) { + mask |= 1 << (y & 31); + colortex.delay_clear--; + if (blend_key || use_discard) { + // If blending or discard is used, old color values + // might be sampled, so we need to clear the entire row to fill it. + force_clear_row

(colortex, y); + } else if (depth) { + if (depth->is_flat() || !cursor) { + // If flat depth is used, we can't cheaply predict which samples will + // pass. + force_clear_row

(colortex, y); + } else { + // Otherwise if depth runs are used, see how many samples initially pass + // the depth test and only fill the row outside those. The fragment + // shader will fill the row within the passed samples. + int passed = + DepthCursor(*cursor).check_passed(z, ctx->depthfunc); + if (startx > 0 || startx + passed < colortex.width) { + force_clear_row

(colortex, y, startx, startx + passed); + } + } + } else if (startx > 0 || endx < colortex.width) { + // Otherwise, we only need to clear the row outside of the span. + // The fragment shader will fill the row within the span itself. + force_clear_row

(colortex, y, startx, endx); + } + } +} + +// Perpendicular dot-product is the dot-product of a vector with the +// perpendicular vector of the other, i.e. dot(a, {-b.y, b.x}) +template +static ALWAYS_INLINE auto perpDot(T a, T b) { + return a.x * b.y - a.y * b.x; +} + +// Check if the winding of the initial edges is flipped, requiring us to swap +// the edges to avoid spans having negative lengths. Assume that l0.y == r0.y +// due to the initial edge scan in draw_quad/perspective_spans. +template +static ALWAYS_INLINE bool checkIfEdgesFlipped(T l0, T l1, T r0, T r1) { + // If the starting point of the left edge is to the right of the starting + // point of the right edge, then just assume the edges are flipped. If the + // left and right starting points are the same, then check the sign of the + // cross-product of the edges to see if the edges are flipped. Otherwise, + // if the left starting point is actually just to the left of the right + // starting point, then assume no edge flip. + return l0.x > r0.x || (l0.x == r0.x && perpDot(l1 - l0, r1 - r0) > 0.0f); +} + +// Draw spans for each row of a given quad (or triangle) with a constant Z +// value. The quad is assumed convex. It is clipped to fall within the given +// clip rect. In short, this function rasterizes a quad by first finding a +// top most starting point and then from there tracing down the left and right +// sides of this quad until it hits the bottom, outputting a span between the +// current left and right positions at each row along the way. Points are +// assumed to be ordered in either CW or CCW to support this, but currently +// both orders (CW and CCW) are supported and equivalent. +template +static inline void draw_quad_spans(int nump, Point2D p[4], uint32_t z, + Interpolants interp_outs[4], + Texture& colortex, Texture& depthtex, + const ClipRect& clipRect) { + // Only triangles and convex quads supported. 
+ assert(nump == 3 || nump == 4); + + Point2D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most (smallest Y) point from which + // rasterization can start. + int top = nump > 3 && p[3].y < p[2].y + ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3) + : (p[1].y < p[3].y ? 1 : 3)) + : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2) + : (p[1].y < p[2].y ? 1 : 2)); + // Helper to find next index in the points array, walking forward. +#define NEXT_POINT(idx) \ + ({ \ + int cur = (idx) + 1; \ + cur < nump ? cur : 0; \ + }) + // Helper to find the previous index in the points array, walking backward. +#define PREV_POINT(idx) \ + ({ \ + int cur = (idx)-1; \ + cur >= 0 ? cur : nump - 1; \ + }) + // Start looking for "left"-side and "right"-side descending edges starting + // from the determined top point. + int next = NEXT_POINT(top); + int prev = PREV_POINT(top); + if (p[top].y == p[next].y) { + // If the next point is on the same row as the top, then advance one more + // time to the next point and use that as the "left" descending edge. + l0i = next; + l1i = NEXT_POINT(next); + // Assume top and prev form a descending "right" edge, as otherwise this + // will be a collapsed polygon and harmlessly bail out down below. + r0i = top; + r1i = prev; + } else if (p[top].y == p[prev].y) { + // If the prev point is on the same row as the top, then advance to the + // prev again and use that as the "right" descending edge. + // Assume top and next form a non-empty descending "left" edge. + l0i = top; + l1i = next; + r0i = prev; + r1i = PREV_POINT(prev); + } else { + // Both next and prev are on distinct rows from top, so both "left" and + // "right" edges are non-empty/descending. + l0i = r0i = top; + l1i = next; + r1i = prev; + } + // Load the points from the indices. 
+ l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // Start of right edge + l1 = p[l1i]; // End of left edge + r1 = p[r1i]; // End of right edge + // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1: + // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i, + // r1.x, r1.y); + } + + struct Edge { + float yScale; + float xSlope; + float x; + Interpolants interpSlope; + Interpolants interp; + bool edgeMask; + + Edge(float y, const Point2D& p0, const Point2D& p1, const Interpolants& i0, + const Interpolants& i1, int edgeIndex) + : // Inverse Y scale for slope calculations. Avoid divide on 0-length + // edge. Later checks below ensure that Y <= p1.y, or otherwise we + // don't use this edge. We just need to guard against Y == p1.y == + // p0.y. In that case, Y - p0.y == 0 and will cancel out the slopes + // below, except if yScale is Inf for some reason (or worse, NaN), + // which 1/(p1.y-p0.y) might produce if we don't bound it. + yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), + // Calculate dX/dY slope + xSlope((p1.x - p0.x) * yScale), + // Initialize current X based on Y and slope + x(p0.x + (y - p0.y) * xSlope), + // Calculate change in interpolants per change in Y + interpSlope((i1 - i0) * yScale), + // Initialize current interpolants based on Y and slope + interp(i0 + (y - p0.y) * interpSlope), + // Extract the edge mask status for this edge + edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {} + + void nextRow() { + // step current X and interpolants to next row from slope + x += xSlope; + interp += interpSlope; + } + + float cur_x() const { return x; } + float x_slope() const { return xSlope; } + }; + + // Vertex selection above should result in equal left and right start rows + assert(l0.y == r0.y); + // Find the start y, clip to within the clip rect, and round to row center. + // If AA is enabled, round out conservatively rather than round to nearest. + float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 
0.0f : 0.5f; + float y = floor(max(min(l0.y, clipRect.y1), clipRect.y0) + aaRound) + 0.5f; + // Initialize left and right edges from end points and start Y + Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); + Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); + // WR does not use backface culling, so check if edges are flipped. + bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1); + if (flipped) swap(left, right); + // Get pointer to color buffer and depth buffer at current Y + P* fbuf = (P*)colortex.sample_ptr(0, int(y)); + DepthRun* fdepth = depthtex.buf != nullptr + ? (DepthRun*)depthtex.sample_ptr(0, int(y)) + : nullptr; + // Loop along advancing Ys, rasterizing spans at each row + float checkY = min(min(l1.y, r1.y), clipRect.y1); + // Ensure we don't rasterize out edge bounds + FloatRange clipSpan = + clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); + for (;;) { + // Check if we maybe passed edge ends or outside clip rect... + if (y > checkY) { + // If we're outside the clip rect, we're done. + if (y > clipRect.y1) break; + // Helper to find the next non-duplicate vertex that doesn't loop back. +#define STEP_EDGE(y, e0i, e0, e1i, e1, STEP_POINT, end) \ + do { \ + /* Set new start of edge to be end of old edge */ \ + e0i = e1i; \ + e0 = e1; \ + /* Set new end of edge to next point */ \ + e1i = STEP_POINT(e1i); \ + e1 = p[e1i]; \ + /* If the edge crossed the end, we're done. */ \ + if (e0i == end) return; \ + /* Otherwise, it doesn't advance, so keep searching. */ \ + } while (y > e1.y) + // Check if Y advanced past the end of the left edge + if (y > l1.y) { + // Step to next left edge past Y and reset edge interpolants. + STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i); + (flipped ? right : left) = + Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); + } + // Check if Y advanced past the end of the right edge + if (y > r1.y) { + // Step to next right edge past Y and reset edge interpolants. 
+ STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i); + (flipped ? left : right) = + Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); + } + // Reset the clip bounds for the new edges + clipSpan = + clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); + // Reset check condition for next time around. + checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1); + } + + // Calculate a potentially AA'd span and check if it is non-empty. + IntRange span = aa_span(fbuf, left, right, clipSpan); + if (span.len() > 0) { + // If user clip planes are enabled, use them to bound the current span. + if (vertex_shader->use_clip_distance()) { + span = span.intersect(clip_distance_range(left, right)); + if (span.len() <= 0) goto next_span; + } + ctx->shaded_rows++; + ctx->shaded_pixels += span.len(); + // Advance color/depth buffer pointers to the start of the span. + P* buf = fbuf + span.start; + // Check if we will need to use depth-buffer or discard on this span. + DepthRun* depth = + depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr; + DepthCursor cursor; + bool use_discard = fragment_shader->use_discard(); + if (use_discard) { + if (depth) { + // If we're using discard, we may have to unpredictably drop out some + // samples. Flatten the depth run array here to allow this. + if (!depth->is_flat()) { + flatten_depth_runs(depth, depthtex.width); + } + // Advance to the depth sample at the start of the span. + depth += span.start; + } + } else if (depth) { + if (!depth->is_flat()) { + // We're not using discard and the depth row is still organized into + // runs. Skip past any runs that would fail the depth test so we + // don't have to do any extra work to process them with the rest of + // the span. + cursor = DepthCursor(depth, depthtex.width, span.start, span.len()); + int skipped = cursor.skip_failed(z, ctx->depthfunc); + // If we fell off the row, that means we couldn't find any passing + // runs. We can just skip the entire span. 
+ if (skipped < 0) { + goto next_span; + } + buf += skipped; + span.start += skipped; + } else { + // The row is already flattened, so just advance to the span start. + depth += span.start; + } + } + + if (colortex.delay_clear) { + // Delayed clear is enabled for the color buffer. Check if needs clear. + prepare_row

(colortex, int(y), span.start, span.end, use_discard, + depth, z, &cursor); + } + + // Initialize fragment shader interpolants to current span position. + fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1); + fragment_shader->gl_FragCoord.y = y; + { + // Change in interpolants is difference between current right and left + // edges per the change in right and left X. If the left and right X + // positions are extremely close together, then avoid stepping the + // interpolants. + float stepScale = 1.0f / (right.x - left.x); + if (!isfinite(stepScale)) stepScale = 0.0f; + Interpolants step = (right.interp - left.interp) * stepScale; + // Advance current interpolants to X at start of span. + Interpolants o = left.interp + step * (span.start + 0.5f - left.x); + fragment_shader->init_span(&o, &step); + } + clipRect.set_clip_mask(span.start, y, buf); + if (!use_discard) { + // Fast paths for the case where fragment discard is not used. + if (depth) { + // If depth is used, we want to process entire depth runs if depth is + // not flattened. + if (!depth->is_flat()) { + draw_depth_span(z, buf, cursor); + goto next_span; + } + // Otherwise, flattened depth must fall back to the slightly slower + // per-chunk depth test path in draw_span below. + } else { + // Check if the fragment shader has an optimized draw specialization. + if (span.len() >= 4 && fragment_shader->has_draw_span(buf)) { + // Draw specialization expects 4-pixel chunks. + int drawn = fragment_shader->draw_span(buf, span.len() & ~3); + buf += drawn; + span.start += drawn; + } + } + draw_span(buf, depth, span.len(), [=] { return z; }); + } else { + // If discard is used, then use slower fallbacks. This should be rare. + // Just needs to work, doesn't need to be too fast yet... + draw_span(buf, depth, span.len(), [=] { return z; }); + } + } + next_span: + // Advance Y and edge interpolants to next row. + y++; + left.nextRow(); + right.nextRow(); + // Advance buffers to next row. 
+ fbuf += colortex.stride() / sizeof(P); + fdepth += depthtex.stride() / sizeof(DepthRun); + } +} + +// Draw perspective-correct spans for a convex quad that has been clipped to +// the near and far Z planes, possibly producing a clipped convex polygon with +// more than 4 sides. This assumes the Z value will vary across the spans and +// requires interpolants to factor in W values. This tends to be slower than +// the simpler 2D draw_quad_spans above, especially since we can't optimize the +// depth test easily when Z values vary, and should be used only rarely if possible. +template +static inline void draw_perspective_spans(int nump, Point3D* p, + Interpolants* interp_outs, + Texture& colortex, Texture& depthtex, + const ClipRect& clipRect) { + Point3D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most point (smallest Y) from which + // rasterization can start. + int top = 0; + for (int i = 1; i < nump; i++) { + if (p[i].y < p[top].y) { + top = i; + } + } + // Find left-most top point, the start of the left descending edge. + // Advance forward in the points array, searching at most nump points + // in case the polygon is flat. + l0i = top; + for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) { + l0i = i; + } + if (l0i == nump - 1) { + for (int i = 0; i <= top && p[i].y == p[top].y; i++) { + l0i = i; + } + } + // Find right-most top point, the start of the right descending edge. + // Advance backward in the points array, searching at most nump points. + r0i = top; + for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) { + r0i = i; + } + if (r0i == 0) { + for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) { + r0i = i; + } + } + // End of left edge is next point after left edge start. + l1i = NEXT_POINT(l0i); + // End of right edge is prev point after right edge start. 
+ r1i = PREV_POINT(r0i); + l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // Start of right edge + l1 = p[l1i]; // End of left edge + r1 = p[r1i]; // End of right edge + } + + struct Edge { + float yScale; + // Current coordinates for edge. Where in the 2D case of draw_quad_spans, + // it is enough to just track the X coordinate as we advance along the rows, + // for the perspective case we also need to keep track of Z and W. For + // simplicity, we just use the full 3D point to track all these coordinates. + Point3D pSlope; + Point3D p; + Interpolants interpSlope; + Interpolants interp; + bool edgeMask; + + Edge(float y, const Point3D& p0, const Point3D& p1, const Interpolants& i0, + const Interpolants& i1, int edgeIndex) + : // Inverse Y scale for slope calculations. Avoid divide on 0-length + // edge. + yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), + // Calculate dX/dY slope + pSlope((p1 - p0) * yScale), + // Initialize current coords based on Y and slope + p(p0 + (y - p0.y) * pSlope), + // Crucially, these interpolants must be scaled by the point's 1/w + // value, which allows linear interpolation in a perspective-correct + // manner. This will be canceled out inside the fragment shader later. 
+ // Calculate change in interpolants per change in Y + interpSlope((i1 * p1.w - i0 * p0.w) * yScale), + // Initialize current interpolants based on Y and slope + interp(i0 * p0.w + (y - p0.y) * interpSlope), + // Extract the edge mask status for this edge + edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {} + + float x() const { return p.x; } + vec2_scalar zw() const { return {p.z, p.w}; } + + void nextRow() { + // step current coords and interpolants to next row from slope + p += pSlope; + interp += interpSlope; + } + + float cur_x() const { return p.x; } + float x_slope() const { return pSlope.x; } + }; + + // Vertex selection above should result in equal left and right start rows + assert(l0.y == r0.y); + // Find the start y, clip to within the clip rect, and round to row center. + // If AA is enabled, round out conservatively rather than round to nearest. + float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f; + float y = floor(max(min(l0.y, clipRect.y1), clipRect.y0) + aaRound) + 0.5f; + // Initialize left and right edges from end points and start Y + Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); + Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); + // WR does not use backface culling, so check if edges are flipped. + bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1); + if (flipped) swap(left, right); + // Get pointer to color buffer and depth buffer at current Y + P* fbuf = (P*)colortex.sample_ptr(0, int(y)); + DepthRun* fdepth = depthtex.buf != nullptr + ? (DepthRun*)depthtex.sample_ptr(0, int(y)) + : nullptr; + // Loop along advancing Ys, rasterizing spans at each row + float checkY = min(min(l1.y, r1.y), clipRect.y1); + // Ensure we don't rasterize out edge bounds + FloatRange clipSpan = + clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); + for (;;) { + // Check if we maybe passed edge ends or outside clip rect... + if (y > checkY) { + // If we're outside the clip rect, we're done. 
+ if (y > clipRect.y1) break; + // Check if Y advanced past the end of the left edge + if (y > l1.y) { + // Step to next left edge past Y and reset edge interpolants. + STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i); + (flipped ? right : left) = + Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); + } + // Check if Y advanced past the end of the right edge + if (y > r1.y) { + // Step to next right edge past Y and reset edge interpolants. + STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i); + (flipped ? left : right) = + Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); + } + // Reset the clip bounds for the new edges + clipSpan = + clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); + // Reset check condition for next time around. + checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1); + } + + // Calculate a potentially AA'd span and check if it is non-empty. + IntRange span = aa_span(fbuf, left, right, clipSpan); + if (span.len() > 0) { + // If user clip planes are enabled, use them to bound the current span. + if (vertex_shader->use_clip_distance()) { + span = span.intersect(clip_distance_range(left, right)); + if (span.len() <= 0) goto next_span; + } + ctx->shaded_rows++; + ctx->shaded_pixels += span.len(); + // Advance color/depth buffer pointers to the start of the span. + P* buf = fbuf + span.start; + // Check if we will need to use depth-buffer or discard on this span. + DepthRun* depth = + depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr; + bool use_discard = fragment_shader->use_discard(); + if (depth) { + // Perspective may cause the depth value to vary on a per sample basis. + // Ensure the depth row is flattened to allow testing of individual + // samples + if (!depth->is_flat()) { + flatten_depth_runs(depth, depthtex.width); + } + // Advance to the depth sample at the start of the span. + depth += span.start; + } + if (colortex.delay_clear) { + // Delayed clear is enabled for the color buffer. 
Check if needs clear. + prepare_row

(colortex, int(y), span.start, span.end, use_discard, + depth); + } + // Initialize fragment shader interpolants to current span position. + fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1); + fragment_shader->gl_FragCoord.y = y; + { + // Calculate the fragment Z and W change per change in fragment X step. + // If the left and right X positions are extremely close together, then + // avoid stepping. + float stepScale = 1.0f / (right.x() - left.x()); + if (!isfinite(stepScale)) stepScale = 0.0f; + vec2_scalar stepZW = (right.zw() - left.zw()) * stepScale; + // Calculate initial Z and W values for span start. + vec2_scalar zw = left.zw() + stepZW * (span.start + 0.5f - left.x()); + // Set fragment shader's Z and W values so that it can use them to + // cancel out the 1/w baked into the interpolants. + fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x); + fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y); + fragment_shader->swgl_StepZW = stepZW; + // Change in interpolants is difference between current right and left + // edges per the change in right and left X. The left and right + // interpolant values were previously multiplied by 1/w, so the step and + // initial span values take this into account. + Interpolants step = (right.interp - left.interp) * stepScale; + // Advance current interpolants to X at start of span. + Interpolants o = left.interp + step * (span.start + 0.5f - left.x()); + fragment_shader->init_span(&o, &step); + } + clipRect.set_clip_mask(span.start, y, buf); + if (!use_discard) { + // No discard is used. Common case. + draw_span(buf, depth, span.len(), packDepth); + } else { + // Discard is used. Rare. + draw_span(buf, depth, span.len(), packDepth); + } + } + next_span: + // Advance Y and edge interpolants to next row. + y++; + left.nextRow(); + right.nextRow(); + // Advance buffers to next row. 
+ fbuf += colortex.stride() / sizeof(P); + fdepth += depthtex.stride() / sizeof(DepthRun); + } +} + +// Clip a primitive against both sides of a view-frustum axis, producing +// intermediate vertexes with interpolated attributes that will no longer +// intersect the selected axis planes. This assumes the primitive is convex +// and should produce at most N+2 vertexes for each invocation (only in the +// worst case where one point falls outside on each of the opposite sides +// with the rest of the points inside). The supplied AA edge mask will be +// modified such that it corresponds to the clipped polygon edges. +template +static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP, + Interpolants* outInterp, int& outEdgeMask) { + // Potential mask bits of which side of a plane a coordinate falls on. + enum SIDE { POSITIVE = 1, NEGATIVE = 2 }; + int numClip = 0; + int edgeMask = outEdgeMask; + Point3D prev = p[nump - 1]; + Interpolants prevInterp = interp[nump - 1]; + float prevCoord = prev.select(AXIS); + // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and + // if so, remember which side it is outside of. In the special case that W is + // negative and |C| < |W|, both -W <= C and C <= W will be false, such that + // we must consider the coordinate as falling outside of both plane sides + // simultaneously. We test each condition separately and combine them to form + // a mask of which plane sides we exceeded. If we neglect to consider both + // sides simultaneously, points can erroneously oscillate from one plane side + // to the other and exceed the supported maximum number of clip outputs. + int prevMask = (prevCoord < -prev.w ? NEGATIVE : 0) | + (prevCoord > prev.w ? POSITIVE : 0); + // Loop through points, finding edges that cross the planes by evaluating + // the side at each point. 
+ outEdgeMask = 0; + for (int i = 0; i < nump; i++, edgeMask >>= 1) { + Point3D cur = p[i]; + Interpolants curInterp = interp[i]; + float curCoord = cur.select(AXIS); + int curMask = + (curCoord < -cur.w ? NEGATIVE : 0) | (curCoord > cur.w ? POSITIVE : 0); + // Check if the previous and current end points are on different sides. If + // the masks of sides intersect, then we consider them to be on the same + // side. So in the case the masks do not intersect, we then consider them + // to fall on different sides. + if (!(curMask & prevMask)) { + // One of the edge's end points is outside the plane with the other + // inside the plane. Find the offset where it crosses the plane and + // adjust the point and interpolants to there. + if (prevMask) { + // Edge that was previously outside crosses inside. + // Evaluate plane equation for previous and current end-point + // based on previous side and calculate relative offset. + if (numClip >= nump + 2) { + // If for some reason we produced more vertexes than we support, just + // bail out. + assert(false); + return 0; + } + // The positive plane is assigned the sign 1, and the negative plane is + // assigned -1. If the point falls outside both planes, that means W is + // negative. To compensate for this, we must interpolate the coordinate + // till W=0, at which point we can choose a single plane side for the + // coordinate to fall on since W will no longer be negative. To compute + // the coordinate where W=0, we compute K = prev.w / (prev.w-cur.w) and + // interpolate C = prev.C + K*(cur.C - prev.C). The sign of C will be + // the side of the plane we need to consider. Substituting K into the + // comparison C < 0, we can then avoid the division in K with a + // cross-multiplication. + float prevSide = + (prevMask & NEGATIVE) && (!(prevMask & POSITIVE) || + prevCoord * (cur.w - prev.w) < + prev.w * (curCoord - prevCoord)) + ? 
-1 + : 1; + float prevDist = prevCoord - prevSide * prev.w; + float curDist = curCoord - prevSide * cur.w; + // It may happen that after we interpolate by the weight k that due to + // floating point rounding we've underestimated the value necessary to + // push it over the clipping boundary. Just in case, nudge the mantissa + // by a single increment so that we essentially round it up and move it + // further inside the clipping boundary. We use nextafter to do this in + // a portable fashion. + float k = prevDist / (prevDist - curDist); + Point3D clipped = prev + (cur - prev) * k; + if (prevSide * clipped.select(AXIS) > clipped.w) { + k = nextafterf(k, 1.0f); + clipped = prev + (cur - prev) * k; + } + outP[numClip] = clipped; + outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; + // Don't output the current edge mask since start point was outside. + numClip++; + } + if (curMask) { + // Edge that was previously inside crosses outside. + // Evaluate plane equation for previous and current end-point + // based on current side and calculate relative offset. + if (numClip >= nump + 2) { + assert(false); + return 0; + } + // In the case the coordinate falls on both plane sides, the computation + // here is much the same as for prevSide, but since we are going from a + // previous W that is positive to current W that is negative, then the + // sign of cur.w - prev.w will flip in the equation. The resulting sign + // is negated to compensate for this. + float curSide = + (curMask & POSITIVE) && (!(curMask & NEGATIVE) || + prevCoord * (cur.w - prev.w) < + prev.w * (curCoord - prevCoord)) + ? 1 + : -1; + float prevDist = prevCoord - curSide * prev.w; + float curDist = curCoord - curSide * cur.w; + // Calculate interpolation weight k and the nudge it inside clipping + // boundary with nextafter. Note that since we were previously inside + // and now crossing outside, we have to flip the nudge direction for + // the weight towards 0 instead of 1. 
+ float k = prevDist / (prevDist - curDist); + Point3D clipped = prev + (cur - prev) * k; + if (curSide * clipped.select(AXIS) > clipped.w) { + k = nextafterf(k, 0.0f); + clipped = prev + (cur - prev) * k; + } + outP[numClip] = clipped; + outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; + // Output the current edge mask since the end point is inside. + outEdgeMask |= (edgeMask & 1) << numClip; + numClip++; + } + } + if (!curMask) { + // The current end point is inside the plane, so output point unmodified. + if (numClip >= nump + 2) { + assert(false); + return 0; + } + outP[numClip] = cur; + outInterp[numClip] = curInterp; + // Output the current edge mask since the end point is inside. + outEdgeMask |= (edgeMask & 1) << numClip; + numClip++; + } + prev = cur; + prevInterp = curInterp; + prevCoord = curCoord; + prevMask = curMask; + } + return numClip; +} + +// Helper function to dispatch to perspective span drawing with points that +// have already been transformed and clipped. +static inline void draw_perspective_clipped(int nump, Point3D* p_clip, + Interpolants* interp_clip, + Texture& colortex, + Texture& depthtex) { + // If polygon is ouside clip rect, nothing to draw. + ClipRect clipRect(colortex); + if (!clipRect.overlaps(nump, p_clip)) { + return; + } + + // Finally draw perspective-correct spans for the polygon. + if (colortex.internal_format == GL_RGBA8) { + draw_perspective_spans(nump, p_clip, interp_clip, colortex, + depthtex, clipRect); + } else if (colortex.internal_format == GL_R8) { + draw_perspective_spans(nump, p_clip, interp_clip, colortex, + depthtex, clipRect); + } else { + assert(false); + } +} + +// Draws a perspective-correct 3D primitive with varying Z value, as opposed +// to a simple 2D planar primitive with a constant Z value that could be +// trivially Z rejected. This requires clipping the primitive against the near +// and far planes to ensure it stays within the valid Z-buffer range. 
The Z +// and W of each fragment of the primitives are interpolated across the +// generated spans and then depth-tested as appropriate. +// Additionally, vertex attributes must be interpolated with perspective- +// correction by dividing by W before interpolation, and then later multiplied +// by W again to produce the final correct attribute value for each fragment. +// This process is expensive and should be avoided if possible for primitive +// batches that are known ahead of time to not need perspective-correction. +static void draw_perspective(int nump, Interpolants interp_outs[4], + Texture& colortex, Texture& depthtex) { + // Lines are not supported with perspective. + assert(nump >= 3); + // Convert output of vertex shader to screen space. + vec4 pos = vertex_shader->gl_Position; + vec3_scalar scale = + vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; + vec3_scalar offset = + make_vec3(make_vec2(ctx->viewport.origin() - colortex.offset), 0.0f) + + scale; + // Verify if point is between near and far planes, rejecting NaN. + if (test_all(pos.z > -pos.w && pos.z < pos.w)) { + // No points cross the near or far planes, so no clipping required. + // Just divide coords by W and convert to viewport. We assume the W + // coordinate is non-zero and the reciprocal is finite since it would + // otherwise fail the test_none condition. + Float w = 1.0f / pos.w; + vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; + Point3D p[4] = {{screen.x.x, screen.y.x, screen.z.x, w.x}, + {screen.x.y, screen.y.y, screen.z.y, w.y}, + {screen.x.z, screen.y.z, screen.z.z, w.z}, + {screen.x.w, screen.y.w, screen.z.w, w.w}}; + draw_perspective_clipped(nump, p, interp_outs, colortex, depthtex); + } else { + // Points cross the near or far planes, so we need to clip. + // Start with the original 3 or 4 points... 
+ Point3D p[4] = {{pos.x.x, pos.y.x, pos.z.x, pos.w.x}, + {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, + {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, + {pos.x.w, pos.y.w, pos.z.w, pos.w.w}}; + // Clipping can expand the points by 1 for each of 6 view frustum planes. + Point3D p_clip[4 + 6]; + Interpolants interp_clip[4 + 6]; + // Clip against near and far Z planes. + nump = clip_side(nump, p, interp_outs, p_clip, interp_clip, + swgl_AAEdgeMask); + // If no points are left inside the view frustum, there's nothing to draw. + if (nump < 3) { + return; + } + // After clipping against only the near and far planes, we might still + // produce points where W = 0, exactly at the camera plane. OpenGL specifies + // that for clip coordinates, points must satisfy: + // -W <= X <= W + // -W <= Y <= W + // -W <= Z <= W + // When Z = W = 0, this is trivially satisfied, but when we transform and + // divide by W below it will produce a divide by 0. Usually we want to only + // clip Z to avoid the extra work of clipping X and Y. We can still project + // points that fall outside the view frustum X and Y so long as Z is valid. + // The span drawing code will then ensure X and Y are clamped to viewport + // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y, + // will push W further inside the view frustum so that it is no longer 0, + // allowing us to finally proceed to projecting the points to the screen. + for (int i = 0; i < nump; i++) { + // Found an invalid W, so need to clip against X and Y... + if (p_clip[i].w <= 0.0f) { + // Ping-pong p_clip -> p_tmp -> p_clip. + Point3D p_tmp[4 + 6]; + Interpolants interp_tmp[4 + 6]; + nump = clip_side(nump, p_clip, interp_clip, p_tmp, interp_tmp, + swgl_AAEdgeMask); + if (nump < 3) return; + nump = clip_side(nump, p_tmp, interp_tmp, p_clip, interp_clip, + swgl_AAEdgeMask); + if (nump < 3) return; + // After clipping against X and Y planes, there's still points left + // to draw, so proceed to trying projection now... 
+ break; + } + } + // Divide coords by W and convert to viewport. + for (int i = 0; i < nump; i++) { + float w = 1.0f / p_clip[i].w; + // If the W coord is essentially zero, small enough that division would + // result in Inf/NaN, then just set the point to all zeroes, as the only + // point that satisfies -W <= X/Y/Z <= W is all zeroes. + p_clip[i] = isfinite(w) + ? Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w) + : Point3D(0.0f); + } + draw_perspective_clipped(nump, p_clip, interp_clip, colortex, depthtex); + } +} + +static void draw_quad(int nump, Texture& colortex, Texture& depthtex) { + // Run vertex shader once for the primitive's vertices. + // Reserve space for 6 sets of interpolants, in case we need to clip against + // near and far planes in the perspective case. + Interpolants interp_outs[4]; + swgl_ClipFlags = 0; + vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants)); + vec4 pos = vertex_shader->gl_Position; + // Check if any vertex W is different from another. If so, use perspective. + if (test_any(pos.w != pos.w.x)) { + draw_perspective(nump, interp_outs, colortex, depthtex); + return; + } + + // Convert output of vertex shader to screen space. + // Divide coords by W and convert to viewport. + float w = 1.0f / pos.w.x; + // If the W coord is essentially zero, small enough that division would + // result in Inf/NaN, then just set the reciprocal itself to zero so that + // the coordinates becomes zeroed out, as the only valid point that + // satisfies -W <= X/Y/Z <= W is all zeroes. + if (!isfinite(w)) w = 0.0f; + vec2 screen = (pos.sel(X, Y) * w + 1) * 0.5f * + vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) + + make_vec2(ctx->viewport.origin() - colortex.offset); + Point2D p[4] = {{screen.x.x, screen.y.x}, + {screen.x.y, screen.y.y}, + {screen.x.z, screen.y.z}, + {screen.x.w, screen.y.w}}; + + // If quad is ouside clip rect, nothing to draw. 
+ ClipRect clipRect(colortex); + if (!clipRect.overlaps(nump, p)) { + return; + } + + // Since the quad is assumed 2D, Z is constant across the quad. + float screenZ = (pos.z.x * w + 1) * 0.5f; + if (screenZ < 0 || screenZ > 1) { + // Z values would cross the near or far plane, so just bail. + return; + } + // Since Z doesn't need to be interpolated, just set the fragment shader's + // Z and W values here, once and for all fragment shader invocations. + uint32_t z = uint32_t(MAX_DEPTH_VALUE * screenZ); + fragment_shader->gl_FragCoord.z = screenZ; + fragment_shader->gl_FragCoord.w = w; + + // If supplied a line, adjust it so that it is a quad at least 1 pixel thick. + // Assume that for a line that all 4 SIMD lanes were actually filled with + // vertexes 0, 1, 1, 0. + if (nump == 2) { + // Nudge Y height to span at least 1 pixel by advancing to next pixel + // boundary so that we step at least 1 row when drawing spans. + if (int(p[0].y + 0.5f) == int(p[1].y + 0.5f)) { + p[2].y = 1 + int(p[1].y + 0.5f); + p[3].y = p[2].y; + // Nudge X width to span at least 1 pixel so that rounded coords fall on + // separate pixels. + if (int(p[0].x + 0.5f) == int(p[1].x + 0.5f)) { + p[1].x += 1.0f; + p[2].x += 1.0f; + } + } else { + // If the line already spans at least 1 row, then assume line is vertical + // or diagonal and just needs to be dilated horizontally. + p[2].x += 1.0f; + p[3].x += 1.0f; + } + // Pretend that it's a quad now... + nump = 4; + } + + // Finally draw 2D spans for the quad. Currently only supports drawing to + // RGBA8 and R8 color buffers. 
+ if (colortex.internal_format == GL_RGBA8) { + draw_quad_spans(nump, p, z, interp_outs, colortex, depthtex, + clipRect); + } else if (colortex.internal_format == GL_R8) { + draw_quad_spans(nump, p, z, interp_outs, colortex, depthtex, + clipRect); + } else { + assert(false); + } +} + +template +static inline void draw_elements(GLsizei count, GLsizei instancecount, + size_t offset, VertexArray& v, + Texture& colortex, Texture& depthtex) { + Buffer& indices_buf = ctx->buffers[v.element_array_buffer_binding]; + if (!indices_buf.buf || offset >= indices_buf.size) { + return; + } + assert((offset & (sizeof(INDEX) - 1)) == 0); + INDEX* indices = (INDEX*)(indices_buf.buf + offset); + count = min(count, (GLsizei)((indices_buf.size - offset) / sizeof(INDEX))); + // Triangles must be indexed at offsets 0, 1, 2. + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. + if (count == 6 && indices[1] == indices[0] + 1 && + indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) { + assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1); + // Fast path - since there is only a single quad, we only load per-vertex + // attribs once for all instances, as they won't change across instances + // or within an instance. 
+ vertex_shader->load_attribs(v.attribs, indices[0], 0, 4); + draw_quad(4, colortex, depthtex); + for (GLsizei instance = 1; instance < instancecount; instance++) { + vertex_shader->load_attribs(v.attribs, indices[0], instance, 0); + draw_quad(4, colortex, depthtex); + } + } else { + for (GLsizei instance = 0; instance < instancecount; instance++) { + for (GLsizei i = 0; i + 3 <= count; i += 3) { + if (indices[i + 1] != indices[i] + 1 || + indices[i + 2] != indices[i] + 2) { + continue; + } + if (i + 6 <= count && indices[i + 5] == indices[i] + 3) { + assert(indices[i + 3] == indices[i] + 2 && + indices[i + 4] == indices[i] + 1); + vertex_shader->load_attribs(v.attribs, indices[i], instance, 4); + draw_quad(4, colortex, depthtex); + i += 3; + } else { + vertex_shader->load_attribs(v.attribs, indices[i], instance, 3); + draw_quad(3, colortex, depthtex); + } + } + } + } +} diff --git a/gfx/wr/swgl/src/swgl_ext.h b/gfx/wr/swgl/src/swgl_ext.h new file mode 100644 index 0000000000..e892981f4c --- /dev/null +++ b/gfx/wr/swgl/src/swgl_ext.h @@ -0,0 +1,1924 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// When using a solid color with clip masking, the cost of loading the clip mask +// in the blend stage exceeds the cost of processing the color. Here we handle +// the entire span of clip mask texture before the blend stage to more +// efficiently process it and modulate it with color without incurring blend +// stage overheads. 
+template +static void commit_masked_solid_span(P* buf, C color, int len) { + override_clip_mask(); + uint8_t* mask = get_clip_mask(buf); + for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) { + commit_span( + buf, + blend_span( + buf, + applyColor(expand_mask(buf, unpack(unaligned_load(mask))), + color))); + } + restore_clip_mask(); +} + +// When using a solid color with anti-aliasing, most of the solid span will not +// benefit from anti-aliasing in the opaque region. We only want to apply the AA +// blend stage in the non-opaque start and end of the span where AA is needed. +template +static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) { + if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) { + commit_solid_span(buf, r, start); + buf += start; + len -= start; + } + if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) { + override_aa(); + commit_solid_span(buf, r, opaque); + restore_aa(); + buf += opaque; + len -= opaque; + } + if (len > 0) { + commit_solid_span(buf, r, len); + } +} + +// Forces a value with vector run-class to have scalar run-class. +template +static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) { + return force_scalar(v); +} + +// Advance all varying inperpolants by a single chunk +#define swgl_stepInterp() step_interp_inputs() + +// Pseudo-intrinsic that accesses the interpolation step for a given varying +#define swgl_interpStep(v) (interp_step.v) + +// Commit an entire span of a solid color. This dispatches to clip-masked and +// anti-aliased fast-paths as appropriate. 
+#define swgl_commitSolid(format, v, n) \ + do { \ + int len = (n); \ + if (blend_key) { \ + if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { \ + commit_masked_solid_span(swgl_Out##format, \ + packColor(swgl_Out##format, (v)), len); \ + } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { \ + commit_aa_solid_span(swgl_Out##format, \ + pack_span(swgl_Out##format, (v)), len); \ + } else { \ + commit_solid_span(swgl_Out##format, \ + pack_span(swgl_Out##format, (v)), len); \ + } \ + } else { \ + commit_solid_span(swgl_Out##format, \ + pack_span(swgl_Out##format, (v)), len); \ + } \ + swgl_Out##format += len; \ + swgl_SpanLength -= len; \ + } while (0) +#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength) +#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength) +#define swgl_commitPartialSolidRGBA8(len, v) \ + swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength)) +#define swgl_commitPartialSolidR8(len, v) \ + swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength)) + +#define swgl_commitChunk(format, chunk) \ + do { \ + auto r = chunk; \ + if (blend_key) r = blend_span(swgl_Out##format, r); \ + commit_span(swgl_Out##format, r); \ + swgl_Out##format += swgl_StepSize; \ + swgl_SpanLength -= swgl_StepSize; \ + } while (0) + +// Commit a single chunk of a color +#define swgl_commitColor(format, color) \ + swgl_commitChunk(format, pack_pixels_##format(color)) +#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color) +#define swgl_commitColorR8(color) swgl_commitColor(R8, color) + +template +static ALWAYS_INLINE bool swgl_isTextureLinear(S s) { + return s->filter == TextureFilter::LINEAR; +} + +template +static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) { + return s->format == TextureFormat::RGBA8; +} + +template +static ALWAYS_INLINE bool swgl_isTextureR8(S s) { + return s->format == TextureFormat::R8; +} + +// Use the default linear quantization scale of 128. 
This gives 7 bits of +// fractional precision, which when multiplied with a signed 9 bit value +// still fits in a 16 bit integer. +const int swgl_LinearQuantizeScale = 128; + +// Quantizes UVs for access into a linear texture. +template +static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) { + return linearQuantize(p, swgl_LinearQuantizeScale, s); +} + +// Quantizes an interpolation step for UVs for access into a linear texture. +template +static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) { + return samplerScale(s, p) * swgl_LinearQuantizeScale; +} + +template +static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf, + S sampler, ivec2 i) { + return textureLinearUnpackedRGBA8(sampler, i); +} + +template +static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf, + S sampler, ivec2 i) { + return textureLinearUnpackedR8(sampler, i); +} + +template +static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) { + return swgl_isTextureRGBA8(s); +} + +template +static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) { + return swgl_isTextureR8(s); +} + +// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets +// for linear sampling. +#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \ + uv = swgl_linearQuantize(sampler, uv); \ + vec2_scalar uv_step = \ + float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \ + vec2_scalar min_uv = max( \ + swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \ + vec2_scalar max_uv = \ + max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \ + min_uv); + +// Implements the fallback linear filter that can deal with clamping and +// arbitrary scales. 
+template +static P* blendTextureLinearFallback(S sampler, vec2 uv, int span, + vec2_scalar uv_step, vec2_scalar min_uv, + vec2_scalar max_uv, C color, P* buf) { + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + commit_blend_span( + buf, applyColor(textureLinearUnpacked(buf, sampler, + ivec2(clamp(uv, min_uv, max_uv))), + color)); + } + return buf; +} + +static ALWAYS_INLINE U64 castForShuffle(V16 r) { + return bit_cast(r); +} +static ALWAYS_INLINE U16 castForShuffle(V4 r) { + return bit_cast(r); +} + +static ALWAYS_INLINE V16 applyFracX(V16 r, I16 fracx) { + return r * fracx.xxxxyyyyzzzzwwww; +} +static ALWAYS_INLINE V4 applyFracX(V4 r, I16 fracx) { + return r * fracx; +} + +// Implements a faster linear filter that works with axis-aligned constant Y but +// scales less than 1, i.e. upscaling. In this case we can optimize for the +// constant Y fraction as well as load all chunks from memory in a single tap +// for each row. +template +static void blendTextureLinearUpscale(S sampler, vec2 uv, int span, + vec2_scalar uv_step, vec2_scalar min_uv, + vec2_scalar max_uv, C color, P* buf) { + typedef VectorType packed_type; + typedef VectorType unpacked_type; + typedef VectorType signed_unpacked_type; + + ivec2 i(clamp(uv, min_uv, max_uv)); + ivec2 frac = i; + i >>= 7; + P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x)); + P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x)); + I16 fracx = computeFracX(sampler, i, frac); + int16_t fracy = computeFracY(frac).x; + auto src0 = + CONVERT(unaligned_load(&row0[i.x.x]), signed_unpacked_type); + auto src1 = + CONVERT(unaligned_load(&row1[i.x.x]), signed_unpacked_type); + auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); + + // We attempt to sample ahead by one chunk and interpolate it with the current + // one. However, due to the complication of upscaling, we may not necessarily + // shift in all the next set of samples. 
+ for (P* end = buf + span; buf < end; buf += 4) { + uv.x += uv_step.x; + I32 ixn = cast(uv.x); + I16 fracn = computeFracNoClamp(ixn); + ixn >>= 7; + auto src0n = CONVERT(unaligned_load(&row0[ixn.x]), + signed_unpacked_type); + auto src1n = CONVERT(unaligned_load(&row1[ixn.x]), + signed_unpacked_type); + auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); + + // Since we're upscaling, we know that a source pixel has a larger footprint + // than the destination pixel, and thus all the source pixels needed for + // this chunk will fall within a single chunk of texture data. However, + // since the source pixels don't map 1:1 with destination pixels, we need to + // shift the source pixels over based on their offset from the start of the + // chunk. This could conceivably be optimized better with usage of PSHUFB or + // VTBL instructions However, since PSHUFB requires SSSE3, instead we resort + // to masking in the correct pixels to avoid having to index into memory. + // For the last sample to interpolate with, we need to potentially shift in + // a sample from the next chunk over in the case the samples fill out an + // entire chunk. + auto shuf = src; + auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4); + if (i.x.y == i.x.x) { + shuf = shuf.xxyz; + shufn = shufn.xxyz; + } + if (i.x.z == i.x.y) { + shuf = shuf.xyyz; + shufn = shufn.xyyz; + } + if (i.x.w == i.x.z) { + shuf = shuf.xyzz; + shufn = shufn.xyzz; + } + + // Convert back to a signed unpacked type so that we can interpolate the + // final result. + auto interp = bit_cast(shuf); + auto interpn = bit_cast(shufn); + interp += applyFracX(interpn - interp, fracx) >> 7; + + commit_blend_span( + buf, applyColor(bit_cast(interp), color)); + + i.x = ixn; + fracx = fracn; + src = srcn; + } +} + +// This is the fastest variant of the linear filter that still provides +// filtering. 
In cases where there is no scaling required, but we have a +// subpixel offset that forces us to blend in neighboring pixels, we can +// optimize away most of the memory loads and shuffling that is required by the +// fallback filter. +template +static void blendTextureLinearFast(S sampler, vec2 uv, int span, + vec2_scalar min_uv, vec2_scalar max_uv, + C color, P* buf) { + typedef VectorType packed_type; + typedef VectorType unpacked_type; + typedef VectorType signed_unpacked_type; + + ivec2 i(clamp(uv, min_uv, max_uv)); + ivec2 frac = i; + i >>= 7; + P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); + P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); + int16_t fracx = computeFracX(sampler, i, frac).x; + int16_t fracy = computeFracY(frac).x; + auto src0 = CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1 = CONVERT(unaligned_load(row1), signed_unpacked_type); + auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); + + // Since there is no scaling, we sample ahead by one chunk and interpolate it + // with the current one. We can then reuse this value on the next iteration. + for (P* end = buf + span; buf < end; buf += 4) { + row0 += 4; + row1 += 4; + auto src0n = + CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1n = + CONVERT(unaligned_load(row1), signed_unpacked_type); + auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); + + // For the last sample to interpolate with, we need to potentially shift in + // a sample from the next chunk over since the samples fill out an entire + // chunk. + auto interp = bit_cast(src); + auto interpn = + bit_cast(SHUFFLE(src, srcn, 1, 2, 3, 4)); + interp += ((interpn - interp) * fracx) >> 7; + + commit_blend_span( + buf, applyColor(bit_cast(interp), color)); + + src = srcn; + } +} + +// Implements a faster linear filter that works with axis-aligned constant Y but +// downscaling the texture by half. 
In this case we can optimize for the +// constant X/Y fractions and reduction factor while minimizing shuffling. +template +static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span, + vec2_scalar min_uv, + vec2_scalar max_uv, C color, + P* buf) { + typedef VectorType packed_type; + typedef VectorType unpacked_type; + typedef VectorType signed_unpacked_type; + + ivec2 i(clamp(uv, min_uv, max_uv)); + ivec2 frac = i; + i >>= 7; + P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); + P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); + int16_t fracx = computeFracX(sampler, i, frac).x; + int16_t fracy = computeFracY(frac).x; + + for (P* end = buf + span; buf < end; buf += 4) { + auto src0 = + CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1 = + CONVERT(unaligned_load(row1), signed_unpacked_type); + auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); + row0 += 4; + row1 += 4; + auto src0n = + CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1n = + CONVERT(unaligned_load(row1), signed_unpacked_type); + auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); + row0 += 4; + row1 += 4; + + auto interp = + bit_cast(SHUFFLE(src, srcn, 0, 2, 4, 6)); + auto interpn = + bit_cast(SHUFFLE(src, srcn, 1, 3, 5, 7)); + interp += ((interpn - interp) * fracx) >> 7; + + commit_blend_span( + buf, applyColor(bit_cast(interp), color)); + } +} + +enum LinearFilter { + // No linear filter is needed. + LINEAR_FILTER_NEAREST = 0, + // The most general linear filter that handles clamping and varying scales. + LINEAR_FILTER_FALLBACK, + // A linear filter optimized for axis-aligned upscaling. + LINEAR_FILTER_UPSCALE, + // A linear filter with no scaling but with subpixel offset. + LINEAR_FILTER_FAST, + // A linear filter optimized for 2x axis-aligned downscaling. + LINEAR_FILTER_DOWNSCALE +}; + +// Dispatches to an appropriate linear filter depending on the selected filter. 
+template +static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span, + vec2_scalar uv_step, vec2_scalar min_uv, + vec2_scalar max_uv, C color, P* buf, + LinearFilter filter) { + P* end = buf + span; + if (filter != LINEAR_FILTER_FALLBACK) { + // If we're not using the fallback, then Y is constant across the entire + // row. We just need to ensure that we handle any samples that might pull + // data from before the start of the row and require clamping. + float beforeDist = max(0.0f, min_uv.x) - uv.x.x; + if (beforeDist > 0) { + int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0, + int(end - buf)); + buf = blendTextureLinearFallback(sampler, uv, before, uv_step, + min_uv, max_uv, color, buf); + uv.x += (before / swgl_StepSize) * uv_step.x; + } + // We need to check how many samples we can take from inside the row without + // requiring clamping. In case the filter oversamples the row by a step, we + // subtract off a step from the width to leave some room. + float insideDist = + min(max_uv.x, float((int(sampler->width) - swgl_StepSize) * + swgl_LinearQuantizeScale)) - + uv.x.x; + if (uv_step.x > 0.0f && insideDist >= uv_step.x) { + int32_t inside = int(end - buf); + if (filter == LINEAR_FILTER_DOWNSCALE) { + inside = min(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) & + ~(swgl_StepSize - 1), + inside); + if (inside > 0) { + blendTextureLinearDownscale(sampler, uv, inside, min_uv, + max_uv, color, buf); + buf += inside; + uv.x += (inside / swgl_StepSize) * uv_step.x; + } + } else if (filter == LINEAR_FILTER_UPSCALE) { + inside = min(int(insideDist / uv_step.x) * swgl_StepSize, inside); + if (inside > 0) { + blendTextureLinearUpscale(sampler, uv, inside, uv_step, min_uv, + max_uv, color, buf); + buf += inside; + uv.x += (inside / swgl_StepSize) * uv_step.x; + } + } else { + inside = min(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) & + ~(swgl_StepSize - 1), + inside); + if (inside > 0) { + blendTextureLinearFast(sampler, uv, 
inside, min_uv, max_uv, + color, buf); + buf += inside; + uv.x += (inside / swgl_StepSize) * uv_step.x; + } + } + } + } + // If the fallback filter was requested, or if there are any samples left that + // may be outside the row and require clamping, then handle that with here. + if (buf < end) { + buf = blendTextureLinearFallback( + sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf); + } + return buf; +} + +// Helper function to quantize UVs for linear filtering before dispatch +template +static inline int blendTextureLinear(S sampler, vec2 uv, int span, + const vec4_scalar& uv_rect, C color, + P* buf, LinearFilter filter) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); + blendTextureLinearDispatch(sampler, uv, span, uv_step, min_uv, max_uv, + color, buf, filter); + return span; +} + +// Samples an axis-aligned span of on a single row of a texture using 1:1 +// nearest filtering. Sampling is constrained to only fall within the given UV +// bounds. This requires a pointer to the destination buffer. An optional color +// modulus can be supplied. +template +static int blendTextureNearestFast(S sampler, vec2 uv, int span, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + + typedef VectorType packed_type; + + ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv))); + ivec2_scalar minUV = + make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y})); + ivec2_scalar maxUV = + make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w})); + + // Calculate the row pointer within the buffer, clamping to within valid row + // bounds. + P* row = + &((P*)sampler + ->buf)[clampCoord(clamp(i.y, minUV.y, maxUV.y), sampler->height) * + sampler->stride]; + // Find clamped X bounds within the row. 
+ int minX = clamp(minUV.x, 0, sampler->width - 1); + int maxX = clamp(maxUV.x, minX, sampler->width - 1); + int curX = i.x; + int endX = i.x + span; + // If we need to start sampling below the valid sample bounds, then we need to + // fill this section with a constant clamped sample. + if (curX < minX) { + int n = min(minX, endX) - curX; + auto src = + applyColor(unpack(bit_cast(V4

(row[minX]))), color); + commit_solid_span(buf, src, n); + buf += n; + curX += n; + } + // Here we only deal with valid samples within the sample bounds. No clamping + // should occur here within these inner loops. + int n = max(min(maxX + 1, endX) - curX, 0); + // Try to process as many chunks as possible with full loads and stores. + for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) { + auto src = applyColor(unaligned_load(&row[curX]), color); + commit_blend_span(buf, src); + } + n &= 3; + // If we have any leftover samples after processing chunks, use partial loads + // and stores. + if (n > 0) { + auto src = applyColor(partial_load_span(&row[curX], n), color); + commit_blend_span(buf, src, n); + buf += n; + curX += n; + } + // If we still have samples left above the valid sample bounds, then we again + // need to fill this section with a constant clamped sample. + if (curX < endX) { + auto src = + applyColor(unpack(bit_cast(V4

(row[maxX]))), color); + commit_solid_span(buf, src, endX - curX); + } + return span; +} + +// We need to verify that the pixel step reasonably approximates stepping by a +// single texel for every pixel we need to reproduce. Try to ensure that the +// margin of error is no more than approximately 2^-7. Also, we check here if +// the scaling can be quantized for acceleration. +template +static ALWAYS_INLINE int spanNeedsScale(int span, T P) { + span &= ~(128 - 1); + span += 128; + int scaled = round((P.x.y - P.x.x) * span); + return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0; +} + +// Helper function to decide whether we can safely apply 1:1 nearest filtering +// without diverging too much from the linear filter. +template +static inline LinearFilter needsTextureLinear(S sampler, T P, int span) { + // If each row is not wide enough for linear filtering, then just use nearest + // filtering. + if (sampler->width < 2) { + return LINEAR_FILTER_NEAREST; + } + // First verify if the row Y doesn't change across samples + if (P.y.x != P.y.y) { + return LINEAR_FILTER_FALLBACK; + } + P = samplerScale(sampler, P); + if (int scale = spanNeedsScale(span, P)) { + // If the source region is not flipped and smaller than the destination, + // then we can use the upscaling filter since row Y is constant. + return P.x.x < P.x.y && P.x.y - P.x.x <= 1 + ? LINEAR_FILTER_UPSCALE + : (scale == 2 ? LINEAR_FILTER_DOWNSCALE + : LINEAR_FILTER_FALLBACK); + } + // Also verify that we're reasonably close to the center of a texel + // so that it doesn't look that much different than if a linear filter + // was used. + if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 || + (int(P.y.x * 4.0f + 0.5f) & 3) != 2) { + // The source and destination regions are the same, but there is a + // significant subpixel offset. We can use a faster linear filter to deal + // with the offset in this case. 
+ return LINEAR_FILTER_FAST; + } + // Otherwise, we have a constant 1:1 step and we're stepping reasonably close + // to the center of each pixel, so it's safe to disable the linear filter and + // use nearest. + return LINEAR_FILTER_NEAREST; +} + +// Commit an entire span with linear filtering +#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int len = (n); \ + int drawn = 0; \ + if (LinearFilter filter = needsTextureLinear(s, p, len)) { \ + if (blend_key) { \ + drawn = blendTextureLinear(s, p, len, uv_rect, packed_color, \ + swgl_Out##format, filter); \ + } else { \ + drawn = blendTextureLinear(s, p, len, uv_rect, packed_color, \ + swgl_Out##format, filter); \ + } \ + } else if (blend_key) { \ + drawn = blendTextureNearestFast(s, p, len, uv_rect, packed_color, \ + swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestFast(s, p, len, uv_rect, packed_color, \ + swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \ + swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength) +#define swgl_commitTextureLinearR8(s, p, uv_rect) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength) + +// Commit a partial span with linear filtering, optionally inverting the color +#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), \ + min(int(len), swgl_SpanLength)) +#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(), \ + min(int(len), swgl_SpanLength)) + +// Commit an entire span with linear filtering that is scaled by a color +#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \ + swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength) +#define 
swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength) + +// Helper function that samples from an R8 texture while expanding it to support +// a differing framebuffer format. +template +static inline int blendTextureLinearR8(S sampler, vec2 uv, int span, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!swgl_isTextureR8(sampler) || sampler->width < 2) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + commit_blend_span( + buf, applyColor(expand_mask(buf, textureLinearUnpackedR8( + sampler, + ivec2(clamp(uv, min_uv, max_uv)))), + color)); + } + return span; +} + +// Commit an entire span with linear filtering while expanding from R8 to RGBA8 +#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color) \ + do { \ + auto packed_color = packColor(swgl_OutRGBA8, color); \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendTextureLinearR8(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_OutRGBA8); \ + } else { \ + drawn = blendTextureLinearR8(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_OutRGBA8); \ + } \ + swgl_OutRGBA8 += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \ + swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor()) + +// Compute repeating UVs, possibly constrained by tile repeat limits +static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) { + if (tile_repeat.x > 0.0f) { + // Clamp to a number slightly less than the tile repeat limit so that + // it results in a number close to but not equal to 1 after fract(). + // This avoids fract() yielding 0 if the limit was left as whole integer. 
+ uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f); + } + return fract(uv); +} + +// Compute the number of non-repeating steps before we need to potentially +// repeat the UVs. +static inline int computeNoRepeatSteps(Float uv, float uv_step, + float tile_repeat, int steps) { + if (uv.w < uv.x) { + // Ensure the UV taps are ordered low to high. + uv = uv.wzyx; + } + // Check if the samples cross the boundary of the next whole integer or the + // tile repeat limit, whichever is lower. + float limit = floor(uv.x) + 1.0f; + if (tile_repeat > 0.0f) { + limit = min(limit, tile_repeat); + } + return uv.x >= 0.0f && uv.w < limit + ? (uv_step != 0.0f + ? int(clamp((limit - uv.x) / uv_step, 0.0f, float(steps))) + : steps) + : 0; +} + +// Blends an entire span of texture with linear filtering and repeating UVs. +template +static int blendTextureLinearRepeat(S sampler, vec2 uv, int span, + const vec2_scalar& tile_repeat, + const vec4_scalar& uv_repeat, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y}; + vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y}; + // Choose a linear filter to use for no-repeat sub-spans + LinearFilter filter = + needsTextureLinear(sampler, uv * uv_scale + uv_offset, span); + // We need to step UVs unscaled and unquantized so that we can modulo them + // with fract. We use uv_scale and uv_offset to map them into the correct + // range. 
+ vec2_scalar uv_step = + float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; + uv_scale = swgl_linearQuantizeStep(sampler, uv_scale); + uv_offset = swgl_linearQuantize(sampler, uv_offset); + vec2_scalar min_uv = max( + swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); + vec2_scalar max_uv = max( + swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv); + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + int steps = int(end - buf) / swgl_StepSize; + // Find the sub-span before UVs repeat to avoid expensive repeat math + steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); + if (steps > 0) { + steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); + if (steps > 0) { + buf = blendTextureLinearDispatch( + sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize, + uv_step * uv_scale, min_uv, max_uv, color, buf, filter); + if (buf >= end) { + break; + } + uv += steps * uv_step; + } + } + // UVs might repeat within this step, so explicitly compute repeated UVs + vec2 repeated_uv = clamp( + tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv); + commit_blend_span( + buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)), + color)); + } + return span; +} + +// Commit an entire span with linear filtering and repeating UVs +#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \ + uv_rect, color) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendTextureLinearRepeat(s, p, swgl_SpanLength, \ + tile_repeat, uv_repeat, uv_rect, \ + packed_color, swgl_Out##format); \ + } else { \ + drawn = blendTextureLinearRepeat(s, p, swgl_SpanLength, \ + tile_repeat, uv_repeat, uv_rect, \ + packed_color, swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define 
swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ + uv_rect) \ + swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ + NoColor()) +#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \ + uv_rect, color) \ + swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ + color) + +template +static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf, + S sampler, ivec2 i) { + return textureNearestPackedRGBA8(sampler, i); +} + +// Blends an entire span of texture with nearest filtering and either +// repeated or clamped UVs. +template +static int blendTextureNearestRepeat(S sampler, vec2 uv, int span, + const vec2_scalar& tile_repeat, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + if (!REPEAT) { + // If clamping, then we step pre-scaled to the sampler. For repeat modes, + // this will be accomplished via uv_scale instead. + uv = samplerScale(sampler, uv); + } + vec2_scalar uv_step = + float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; + vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}); + vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}); + vec2_scalar uv_scale = max_uv - min_uv; + // If the effective sampling area of this texture is only a single pixel, then + // treat it as a solid span. For repeat modes, the bounds are specified on + // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so + // the test varies depending on which. If the sample range on an axis is + // greater than one pixel, we can still check if we don't move far enough from + // the pixel center on that axis to hit the next pixel. + if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) || + (abs(uv_step.x) * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) && + (int(min_uv.y) + (REPEAT ? 
1 : 0) >= int(max_uv.y) || + (abs(uv_step.y) * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) { + vec2 repeated_uv = REPEAT + ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv + : clamp(uv, min_uv, max_uv); + commit_solid_span(buf, + applyColor(unpack(textureNearestPacked( + buf, sampler, ivec2(repeated_uv))), + color), + span); + } else { + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + if (REPEAT) { + int steps = int(end - buf) / swgl_StepSize; + // Find the sub-span before UVs repeat to avoid expensive repeat math + steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); + if (steps > 0) { + steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); + if (steps > 0) { + vec2 inside_uv = fract(uv) * uv_scale + min_uv; + vec2 inside_step = uv_step * uv_scale; + for (P* outside = &buf[steps * swgl_StepSize]; buf < outside; + buf += swgl_StepSize, inside_uv += inside_step) { + commit_blend_span( + buf, applyColor( + textureNearestPacked(buf, sampler, ivec2(inside_uv)), + color)); + } + if (buf >= end) { + break; + } + uv += steps * uv_step; + } + } + } + + // UVs might repeat within this step, so explicitly compute repeated UVs + vec2 repeated_uv = REPEAT + ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv + : clamp(uv, min_uv, max_uv); + commit_blend_span( + buf, + applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)), + color)); + } + } + return span; +} + +// Determine if we can use the fast nearest filter for the given nearest mode. +// If the Y coordinate varies more than half a pixel over +// the span (which might cause the texel to alias to the next one), or the span +// needs X scaling, then we have to use the fallback. 
+template +static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) { + P = samplerScale(sampler, P); + return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P); +} + +// Commit an entire span with nearest filtering and either clamped or repeating +// UVs +#define swgl_commitTextureNearest(format, s, p, uv_rect, color) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int drawn = 0; \ + if (needsNearestFallback(s, p, swgl_SpanLength)) { \ + if (blend_key) { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ + swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ + swgl_Out##format); \ + } \ + } else if (blend_key) { \ + drawn = blendTextureNearestFast(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestFast(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \ + swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor()) +#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \ + swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color) + +#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \ + color) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ + swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ + swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ + uv_rect) \ + 
swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \ + NoColor()) +#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \ + uv_repeat, uv_rect, color) \ + swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color) + +// Commit an entire span of texture with filtering determined by sampler state. +#define swgl_commitTexture(format, s, ...) \ + do { \ + if (s->filter == TextureFilter::LINEAR) { \ + swgl_commitTextureLinear##format(s, __VA_ARGS__); \ + } else { \ + swgl_commitTextureNearest##format(s, __VA_ARGS__); \ + } \ + } while (0) +#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__) +#define swgl_commitTextureColorRGBA8(...) \ + swgl_commitTexture(ColorRGBA8, __VA_ARGS__) +#define swgl_commitTextureRepeatRGBA8(...) \ + swgl_commitTexture(RepeatRGBA8, __VA_ARGS__) +#define swgl_commitTextureRepeatColorRGBA8(...) \ + swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__) + +// Commit an entire span of a separable pass of a Gaussian blur that falls +// within the given radius scaled by supplied coefficients, clamped to uv_rect +// bounds. +template +static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect, + P* buf, int span, bool hori, int radius, + vec2_scalar coeffs) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + vec2_scalar size = {float(sampler->width), float(sampler->height)}; + ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size); + ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size)); + int startX = curUV.x; + int endX = min(min(bounds.z, curUV.x + span), int(size.x)); + if (hori) { + for (; curUV.x + swgl_StepSize <= endX; + buf += swgl_StepSize, curUV.x += swgl_StepSize) { + commit_blend_span( + buf, gaussianBlurHorizontal

(sampler, curUV, bounds.x, bounds.z, + radius, coeffs.x, coeffs.y)); + } + } else { + for (; curUV.x + swgl_StepSize <= endX; + buf += swgl_StepSize, curUV.x += swgl_StepSize) { + commit_blend_span( + buf, gaussianBlurVertical

(sampler, curUV, bounds.y, bounds.w, + radius, coeffs.x, coeffs.y)); + } + } + return curUV.x - startX; +} + +#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \ + do { \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendGaussianBlur(s, p, uv_rect, swgl_Out##format, \ + swgl_SpanLength, hori, radius, coeffs); \ + } else { \ + drawn = blendGaussianBlur(s, p, uv_rect, swgl_Out##format, \ + swgl_SpanLength, hori, radius, coeffs); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \ + swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs) +#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \ + swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs) + +// Convert and pack planar YUV samples to RGB output using a color space +static ALWAYS_INLINE PackedRGBA8 convertYUV(const YUVMatrix& rgb_from_ycbcr, + U16 y, U16 u, U16 v) { + auto yy = V8(zip(y, y)); + auto uv = V8(zip(u, v)); + return rgb_from_ycbcr.convert(yy, uv); +} + +// Helper functions to sample from planar YUV textures before converting to RGB +template +static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, + const YUVMatrix& rgb_from_ycbcr, + UNUSED int rescaleFactor) { + switch (sampler0->format) { + case TextureFormat::RGBA8: { + auto planar = textureLinearPlanarRGBA8(sampler0, uv0); + return convertYUV(rgb_from_ycbcr, highHalf(planar.rg), lowHalf(planar.rg), + lowHalf(planar.ba)); + } + case TextureFormat::YUV422: { + auto planar = textureLinearPlanarYUV422(sampler0, uv0); + return convertYUV(rgb_from_ycbcr, planar.y, planar.u, planar.v); + } + default: + assert(false); + return PackedRGBA8(0); + } +} + +template +static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color = C()) { 
+ if (!swgl_isTextureLinear(sampler0)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + auto c = packColor(buf, color); + auto* end = buf + span; + for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) { + commit_blend_span( + buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), + rgb_from_ycbcr, rescaleFactor), + c)); + } + return span; +} + +template +static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, + ivec2 uv1, + const YUVMatrix& rgb_from_ycbcr, + int rescaleFactor) { + switch (sampler1->format) { + case TextureFormat::RG8: { + assert(sampler0->format == TextureFormat::R8); + auto y = textureLinearUnpackedR8(sampler0, uv0); + auto planar = textureLinearPlanarRG8(sampler1, uv1); + return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.rg), + highHalf(planar.rg)); + } + case TextureFormat::RGBA8: { + assert(sampler0->format == TextureFormat::R8); + auto y = textureLinearUnpackedR8(sampler0, uv0); + auto planar = textureLinearPlanarRGBA8(sampler1, uv1); + return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.ba), + highHalf(planar.rg)); + } + case TextureFormat::RG16: { + assert(sampler0->format == TextureFormat::R16); + // The rescaling factor represents how many bits to add to renormalize the + // texture to 16 bits, and so the color depth is actually 16 minus the + // rescaling factor. + // Need to right shift the sample by the amount of bits over 8 it + // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit + // of precision at the low end already, hence 1 is subtracted from the + // color depth. 
+ int colorDepth = 16 - rescaleFactor; + int rescaleBits = (colorDepth - 1) - 8; + auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; + auto uv = textureLinearUnpackedRG16(sampler1, uv1) >> rescaleBits; + return rgb_from_ycbcr.convert(zip(y, y), uv); + } + default: + assert(false); + return PackedRGBA8(0); + } +} + +template +static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, + const vec4_scalar& uv_rect1, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color = C()) { + if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + auto c = packColor(buf, color); + auto* end = buf + span; + for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) { + commit_blend_span( + buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), + sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), + rgb_from_ycbcr, rescaleFactor), + c)); + } + return span; +} + +template +static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, + ivec2 uv1, S2 sampler2, ivec2 uv2, + const YUVMatrix& rgb_from_ycbcr, + int rescaleFactor) { + assert(sampler0->format == sampler1->format && + sampler0->format == sampler2->format); + switch (sampler0->format) { + case TextureFormat::R8: { + auto y = textureLinearUnpackedR8(sampler0, uv0); + auto u = textureLinearUnpackedR8(sampler1, uv1); + auto v = textureLinearUnpackedR8(sampler2, uv2); + return convertYUV(rgb_from_ycbcr, y, u, v); + } + case TextureFormat::R16: { + // The rescaling factor represents how many bits to add to renormalize the + // texture to 16 bits, and so the color depth is actually 16 
minus the + // rescaling factor. + // Need to right shift the sample by the amount of bits over 8 it + // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit + // of precision at the low end already, hence 1 is subtracted from the + // color depth. + int colorDepth = 16 - rescaleFactor; + int rescaleBits = (colorDepth - 1) - 8; + auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; + auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits; + auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits; + return convertYUV(rgb_from_ycbcr, U16(y), U16(u), U16(v)); + } + default: + assert(false); + return PackedRGBA8(0); + } +} + +// Fallback helper for when we can't specifically accelerate YUV with +// composition. +template +static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0, + vec2_scalar uv_step0, vec2_scalar min_uv0, + vec2_scalar max_uv0, S1 sampler1, vec2 uv1, + vec2_scalar uv_step1, vec2_scalar min_uv1, + vec2_scalar max_uv1, S2 sampler2, vec2 uv2, + vec2_scalar uv_step2, vec2_scalar min_uv2, + vec2_scalar max_uv2, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color) { + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0, + uv1 += uv_step1, uv2 += uv_step2) { + commit_blend_span( + buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), + sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), + sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)), + rgb_from_ycbcr, rescaleFactor), + color)); + } +} + +template +static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, + const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2, + const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color = C()) { + 
if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || + !swgl_isTextureLinear(sampler2)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); + LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); + auto c = packColor(buf, color); + blendYUVFallback(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0, + sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, + uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias, + rgb_from_debiased_ycbcr, rescaleFactor, c); + return span; +} + +// A variant of the blendYUV that attempts to reuse the inner loops from the +// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on +// the source data, which in turn allows it to be much faster than blendYUV. +// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer +// and that no color scaling is applied, which we can accomplish via template +// specialization. We need to further validate inside that texture formats +// and dimensions are sane for video and that the video is axis-aligned before +// acceleration can proceed. 
+template +static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, sampler2DRect sampler1, + vec2 uv1, const vec4_scalar& uv_rect1, + sampler2DRect sampler2, vec2 uv2, + const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, NoColor noColor = NoColor()) { + if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || + !swgl_isTextureLinear(sampler2)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); + LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); + auto* end = buf + span; + // CompositeYUV imposes further restrictions on the source textures, such that + // the the Y/U/V samplers must all have a matching format, the U/V samplers + // must have matching sizes and sample coordinates, and there must be no + // change in row across the entire span. + if (sampler0->format == sampler1->format && + sampler1->format == sampler2->format && + sampler1->width == sampler2->width && + sampler1->height == sampler2->height && uv_step0.y == 0 && + uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 && + uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) { + // CompositeYUV does not support a clamp rect, so we must take care to + // advance till we're inside the bounds of the clamp rect. 
+ int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x, + (min_uv1.x - uv1.x.x) / uv_step1.x))), + (end - buf) / swgl_StepSize); + if (outside > 0) { + blendYUVFallback(buf, outside * swgl_StepSize, sampler0, uv0, + uv_step0, min_uv0, max_uv0, sampler1, uv1, + uv_step1, min_uv1, max_uv1, sampler2, uv2, + uv_step2, min_uv2, max_uv2, ycbcr_bias, + rgb_from_debiased_ycbcr, rescaleFactor, noColor); + buf += outside * swgl_StepSize; + uv0.x += outside * uv_step0.x; + uv1.x += outside * uv_step1.x; + uv2.x += outside * uv_step2.x; + } + // Find the amount of chunks inside the clamp rect before we hit the + // maximum. If there are any chunks inside, we can finally dispatch to + // CompositeYUV. + int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x, + (max_uv1.x - uv1.x.x) / uv_step1.x)), + (end - buf) / swgl_StepSize); + if (inside > 0) { + // We need the color depth, which is relative to the texture format and + // rescale factor. + int colorDepth = + (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor; + // Finally, call the inner loop of CompositeYUV. + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + linear_row_yuv( + buf, inside * swgl_StepSize, sampler0, force_scalar(uv0), + uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1), + uv_step1.x / swgl_StepSize, colorDepth, rgb_from_ycbcr); + // Now that we're done, advance past the processed inside portion. + buf += inside * swgl_StepSize; + uv0.x += inside * uv_step0.x; + uv1.x += inside * uv_step1.x; + uv2.x += inside * uv_step2.x; + } + } + // We either got here because we have some samples outside the clamp rect, or + // because some of the preconditions were not satisfied. Process whatever is + // left of the span. 
+ blendYUVFallback(buf, end - buf, sampler0, uv0, uv_step0, min_uv0, + max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, + sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias, + rgb_from_debiased_ycbcr, rescaleFactor, noColor); + return span; +} + +// Commit a single chunk of a YUV surface represented by multiple planar +// textures. This requires a color space specifier selecting how to convert +// from YUV to RGB output. In the case of HDR formats, a rescaling factor +// selects how many bits of precision must be utilized on conversion. See the +// sampleYUV dispatcher functions for the various supported plane +// configurations this intrinsic accepts. +#define swgl_commitTextureLinearYUV(...) \ + do { \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendYUV(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ + } else { \ + drawn = blendYUV(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ + } \ + swgl_OutRGBA8 += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) + +// Commit a single chunk of a YUV surface scaled by a color. +#define swgl_commitTextureLinearColorYUV(...) \ + swgl_commitTextureLinearYUV(__VA_ARGS__) + +// Each gradient stops entry is a pair of RGBA32F start color and end step. +struct GradientStops { + Float startColor; + union { + Float stepColor; + vec4_scalar stepData; + }; + + // Whether this gradient entry can be merged with an adjacent entry. The + // step will be equal with the adjacent step if and only if they can be + // merged, or rather, that the stops are actually part of a single larger + // gradient. + bool can_merge(const GradientStops& next) const { + return stepData == next.stepData; + } + + // Get the interpolated color within the entry based on the offset from its + // start. + Float interpolate(float offset) const { + return startColor + stepColor * offset; + } + + // Get the end color of the entry where interpolation stops. 
+ Float end_color() const { return startColor + stepColor; } +}; + +// Checks if a gradient table of the specified size exists at the UV coords of +// the address within an RGBA32F texture. If so, a linear address within the +// texture is returned that may be used to sample the gradient table later. If +// the address doesn't describe a valid gradient, then a negative value is +// returned. +static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address, + int entries) { + return sampler->format == TextureFormat::RGBA32F && address.y >= 0 && + address.y < int(sampler->height) && address.x >= 0 && + address.x < int(sampler->width) && entries > 0 && + address.x + + int(sizeof(GradientStops) / sizeof(Float)) * entries <= + int(sampler->width) + ? address.y * sampler->stride + address.x * 4 + : -1; +} + +static inline WideRGBA8 sampleGradient(sampler2D sampler, int address, + Float entry) { + assert(sampler->format == TextureFormat::RGBA32F); + assert(address >= 0 && address < int(sampler->height * sampler->stride)); + // Get the integer portion of the entry index to find the entry colors. + I32 index = cast(entry); + // Use the fractional portion of the entry index to control blending between + // entry colors. + Float offset = entry - cast(index); + // Every entry is a pair of colors blended by the fractional offset. + assert(test_all(index >= 0 && + index * int(sizeof(GradientStops) / sizeof(Float)) < + int(sampler->width))); + GradientStops* stops = (GradientStops*)&sampler->buf[address]; + // Blend between the colors for each SIMD lane, then pack them to RGBA8 + // result. Since the layout of the RGBA8 framebuffer is actually BGRA while + // the gradient table has RGBA colors, swizzling is required. 
+ return combine( + packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw), + round_pixel(stops[index.y].interpolate(offset.y).zyxw)), + packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw), + round_pixel(stops[index.w].interpolate(offset.w).zyxw))); +} + +// Samples a gradient entry from the gradient at the provided linearized +// address. The integer portion of the entry index is used to find the entry +// within the table whereas the fractional portion is used to blend between +// adjacent table entries. +#define swgl_commitGradientRGBA8(sampler, address, entry) \ + swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry)) + +// Variant that allows specifying a color multiplier of the gradient result. +#define swgl_commitGradientColorRGBA8(sampler, address, entry, color) \ + swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \ + packColor(swgl_OutRGBA, color))) + +// Samples an entire span of a linear gradient by crawling the gradient table +// and looking for consecutive stops that can be merged into a single larger +// gradient, then interpolating between those larger gradients within the span. +template +static bool commitLinearGradient(sampler2D sampler, int address, float size, + bool tileRepeat, bool gradientRepeat, vec2 pos, + const vec2_scalar& scaleDir, float startOffset, + uint32_t* buf, int span) { + assert(sampler->format == TextureFormat::RGBA32F); + assert(address >= 0 && address < int(sampler->height * sampler->stride)); + GradientStops* stops = (GradientStops*)&sampler->buf[address]; + // Get the chunk delta from the difference in offset steps. This represents + // how far within the gradient table we advance for every step in output, + // normalized to gradient table size. + vec2_scalar posStep = dFdx(pos) * 4.0f; + float delta = dot(posStep, scaleDir); + if (!isfinite(delta)) { + return false; + } + // If we have a repeating brush, then the position will be modulo the [0,1) + // interval. 
Compute coefficients that can be used to quickly evaluate the + // distance to the interval boundary where the offset will wrap. + vec2_scalar distCoeffsX = {0.25f * span, 0.0f}; + vec2_scalar distCoeffsY = distCoeffsX; + if (tileRepeat) { + if (posStep.x != 0.0f) { + distCoeffsX = vec2_scalar{step(0.0f, posStep.x), 1.0f} * recip(posStep.x); + } + if (posStep.y != 0.0f) { + distCoeffsY = vec2_scalar{step(0.0f, posStep.y), 1.0f} * recip(posStep.y); + } + } + for (; span > 0;) { + // Try to process as many chunks as are within the span if possible. + float chunks = 0.25f * span; + vec2 repeatPos = pos; + if (tileRepeat) { + // If this is a repeating brush, then limit the chunks to not cross the + // interval boundaries. + repeatPos = fract(pos); + chunks = min(chunks, distCoeffsX.x - repeatPos.x.x * distCoeffsX.y); + chunks = min(chunks, distCoeffsY.x - repeatPos.y.x * distCoeffsY.y); + } + // Compute the gradient offset from the position. + Float offset = + repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset; + // If repeat is desired, we need to limit the offset to a fractional value. + if (gradientRepeat) { + offset = fract(offset); + } + // To properly handle both clamping and repeating of the table offset, we + // need to ensure we don't run past the 0 and 1 points. Here we compute the + // intercept points depending on whether advancing forwards or backwards in + // the gradient table to ensure the chunk count is limited by the amount + // before intersection. If there is no delta, then we compute no intercept. + float startEntry; + int minIndex, maxIndex; + if (offset.x < 0) { + // If we're below the gradient table, use the first color stop. We can + // only intercept the table if walking forward. + startEntry = 0; + minIndex = int(startEntry); + maxIndex = minIndex; + if (delta > 0) { + chunks = min(chunks, -offset.x / delta); + } + } else if (offset.x < 1) { + // Otherwise, we're inside the gradient table. 
Depending on the direction + // we're walking the the table, we may intersect either the 0 or 1 offset. + // Compute the start entry based on our initial offset, and compute the + // end entry based on the available chunks limited by intercepts. Clamp + // them into the valid range of the table. + startEntry = 1.0f + offset.x * size; + if (delta < 0) { + chunks = min(chunks, -offset.x / delta); + } else if (delta > 0) { + chunks = min(chunks, (1 - offset.x) / delta); + } + float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size, + 0.0f, 1.0f + size); + // Now that we know the range of entries we need to sample, we want to + // find the largest possible merged gradient within that range. Depending + // on which direction we are advancing in the table, we either walk up or + // down the table trying to merge the current entry with the adjacent + // entry. We finally limit the chunks to only sample from this merged + // gradient. + minIndex = int(startEntry); + maxIndex = minIndex; + if (delta > 0) { + while (maxIndex + 1 < endEntry && + stops[maxIndex].can_merge(stops[maxIndex + 1])) { + maxIndex++; + } + chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size)); + } else if (delta < 0) { + while (minIndex - 1 > endEntry && + stops[minIndex - 1].can_merge(stops[minIndex])) { + minIndex--; + } + chunks = min(chunks, (minIndex - startEntry) / (delta * size)); + } + } else { + // If we're above the gradient table, use the last color stop. We can + // only intercept the table if walking backward. + startEntry = 1.0f + size; + minIndex = int(startEntry); + maxIndex = minIndex; + if (delta < 0) { + chunks = min(chunks, (1 - offset.x) / delta); + } + } + // If there are any amount of whole chunks of a merged gradient found, + // then we want to process that as a single gradient span with the start + // and end colors from the min and max entries. 
+ if (chunks >= 1.0f) { + int inside = int(chunks); + // Sample the start color from the min entry and the end color from the + // max entry of the merged gradient. These are scaled to a range of + // 0..0xFF00, as that is the largest shifted value that can fit in a U16. + // Since we are only doing addition with the step value, we can still + // represent negative step values without having to use an explicit sign + // bit, as the result will still come out the same, allowing us to gain an + // extra bit of precision. We will later shift these into 8 bit output + // range while committing the span, but stepping with higher precision to + // avoid banding. We convert from RGBA to BGRA here to avoid doing this in + // the inner loop. + auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00); + auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00); + // Get the color range of the merged gradient, normalized to its size. + auto colorRangeF = + (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex)); + // Compute the actual starting color of the current start offset within + // the merged gradient. The value 0.5 is added to the low bits (0x80) so + // that the color will effectively round to the nearest increment below. + auto colorF = + minColorF + colorRangeF * (startEntry - minIndex) + float(0x80); + // Compute the portion of the color range that we advance on each chunk. + Float deltaColorF = colorRangeF * (delta * size); + // Quantize the color delta and current color. These have already been + // scaled to the 0..0xFF00 range, so we just need to round them to U16. 
+ auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16)); + for (int remaining = inside;;) { + auto color = + combine(CONVERT(round_pixel(colorF, 1), U16), + CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16), + CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16), + CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16)); + // Finally, step the current color through the output chunks, shifting + // it into 8 bit range and outputting as we go. Only process a segment + // at a time to avoid overflowing 8-bit precision due to rounding of + // deltas. + int segment = min(remaining, 256 / 4); + for (auto* end = buf + segment * 4; buf < end; buf += 4) { + commit_blend_span(buf, bit_cast(color >> 8)); + color += deltaColor; + } + remaining -= segment; + if (remaining <= 0) { + break; + } + colorF += deltaColorF * segment; + } + // Deduct the number of chunks inside the gradient from the remaining + // overall span. If we exhausted the span, bail out. + span -= inside * 4; + if (span <= 0) { + break; + } + // Otherwise, assume we're in a transitional section of the gradient that + // will probably require per-sample table lookups, so fall through below. + // We need to re-evaluate the position and offset first, though. + pos += posStep * float(inside); + repeatPos = tileRepeat ? fract(pos) : pos; + offset = + repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset; + if (gradientRepeat) { + offset = fract(offset); + } + } + // If we get here, there were no whole chunks of a merged gradient found + // that we could process, but we still have a non-zero amount of span left. + // That means we have segments of gradient that begin or end at the current + // entry we're on. For this case, we just fall back to sampleGradient which + // will calculate a table entry for each sample, assuming the samples may + // have different table entries. 
+ Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); + commit_blend_span(buf, sampleGradient(sampler, address, entry)); + span -= 4; + buf += 4; + pos += posStep; + } + return true; +} + +// Commits an entire span of a linear gradient, given the address of a table +// previously resolved with swgl_validateGradient. The size of the inner portion +// of the table is given, assuming the table start and ends with a single entry +// each to deal with clamping. Repeating will be handled if necessary. The +// initial offset within the table is used to designate where to start the span +// and how to step through the gradient table. +#define swgl_commitLinearGradientRGBA8(sampler, address, size, tileRepeat, \ + gradientRepeat, pos, scaleDir, \ + startOffset) \ + do { \ + bool drawn = false; \ + if (blend_key) { \ + drawn = commitLinearGradient( \ + sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \ + startOffset, swgl_OutRGBA8, swgl_SpanLength); \ + } else { \ + drawn = commitLinearGradient( \ + sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \ + startOffset, swgl_OutRGBA8, swgl_SpanLength); \ + } \ + if (drawn) { \ + swgl_OutRGBA8 += swgl_SpanLength; \ + swgl_SpanLength = 0; \ + } \ + } while (0) + +template +static ALWAYS_INLINE V fastSqrt(V v) { + if (CLAMP) { + // Clamp to avoid zero or negative. + v = max(v, V(1.0e-12f)); + } +#if USE_SSE2 || USE_NEON + return v * inversesqrt(v); +#else + return sqrt(v); +#endif +} + +template +static ALWAYS_INLINE auto fastLength(V v) { + return fastSqrt(dot(v, v)); +} + +// Samples an entire span of a radial gradient by crawling the gradient table +// and looking for consecutive stops that can be merged into a single larger +// gradient, then interpolating between those larger gradients within the span +// based on the computed position relative to a radius. 
+template +static bool commitRadialGradient(sampler2D sampler, int address, float size, + bool repeat, vec2 pos, float radius, + uint32_t* buf, int span) { + assert(sampler->format == TextureFormat::RGBA32F); + assert(address >= 0 && address < int(sampler->height * sampler->stride)); + GradientStops* stops = (GradientStops*)&sampler->buf[address]; + // clang-format off + // Given position p, delta d, and radius r, we need to repeatedly solve the + // following quadratic for the pixel offset t: + // length(p + t*d) = r + // (px + t*dx)^2 + (py + t*dy)^2 = r^2 + // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is: + // t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0 + // t^2*d.d + t*2*d.p + (p.p-r^2) = 0 + // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to: + // t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d) + // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so + // we cache them below for faster computation. + // + // The quadratic has two solutions, representing the span intersecting the + // given radius of gradient, which can occur at two offsets. If there is only + // one solution (where b^2-4ac = 0), this represents the point at which the + // span runs tangent to the radius. This middle point is significant in that + // before it, we walk down the gradient ramp, and after it, we walk up the + // ramp. + // clang-format on + vec2_scalar pos0 = {pos.x.x, pos.y.x}; + vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x}; + float deltaDelta = dot(delta, delta); + if (!isfinite(deltaDelta) || !isfinite(radius)) { + return false; + } + float invDelta, middleT, middleB; + if (deltaDelta > 0) { + invDelta = 1.0f / deltaDelta; + middleT = -dot(delta, pos0) * invDelta; + middleB = middleT * middleT - dot(pos0, pos0) * invDelta; + } else { + // If position is invariant, just set the coefficients so the quadratic + // always reduces to the end of the span. 
+ invDelta = 0.0f; + middleT = float(span); + middleB = 0.0f; + } + // We only want search for merged gradients up to the minimum of either the + // mid-point or the span length. Cache those offsets here as they don't vary + // in the inner loop. + Float middleEndRadius = fastLength( + pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f}); + float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x; + float endRadius = middleEndRadius.y; + // Convert delta to change in position per chunk. + delta *= 4; + deltaDelta *= 4 * 4; + // clang-format off + // Given current position p and delta d, we reduce: + // length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p)) + // where dot(p+d,p+d) can be accumulated as: + // (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2) + // = p.p + 2p.d + d.d + // Since p increases by d every loop iteration, p.d increases by d.d, and thus + // we can accumulate d.d to calculate 2p.d, then allowing us to get the next + // dot-product by adding it to dot-product p.p of the prior iteration. This + // saves us some multiplications and an expensive sqrt inside the inner loop. + // clang-format on + Float dotPos = dot(pos, pos); + Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta; + float deltaDelta2 = 2.0f * deltaDelta; + for (int t = 0; t < span;) { + // Compute the gradient table offset from the current position. + Float offset = fastSqrt(dotPos) - radius; + float startRadius = radius; + // If repeat is desired, we need to limit the offset to a fractional value. + if (repeat) { + // The non-repeating radius at which the gradient table actually starts, + // radius + floor(offset) = radius + (offset - fract(offset)). + startRadius += offset.x; + offset = fract(offset); + startRadius -= offset.x; + } + // We need to find the min/max index in the table of the gradient we want to + // use as well as the intercept point where we leave this gradient. 
+ float intercept = -1; + int minIndex = 0; + int maxIndex = int(1.0f + size); + if (offset.x < 0) { + // If inside the inner radius of the gradient table, then use the first + // stop. Set the intercept to advance forward to the start of the gradient + // table. + maxIndex = minIndex; + if (t >= middleT) { + intercept = radius; + } + } else if (offset.x < 1) { + // Otherwise, we're inside the valid part of the gradient table. + minIndex = int(1.0f + offset.x * size); + maxIndex = minIndex; + // Find the offset in the gradient that corresponds to the search limit. + // We only search up to the minimum of either the mid-point or the span + // length. Get the table index that corresponds to this offset, clamped so + // that we avoid hitting the beginning (0) or end (1 + size) of the table. + float searchOffset = + (t >= middleT ? endRadius : middleRadius) - startRadius; + int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size)); + // If we are past the mid-point, walk up the gradient table trying to + // merge stops. If we're below the mid-point, we need to walk down the + // table. We note the table index at which we need to look for an + // intercept to determine a valid span. + if (t >= middleT) { + while (maxIndex + 1 <= searchIndex && + stops[maxIndex].can_merge(stops[maxIndex + 1])) { + maxIndex++; + } + intercept = maxIndex + 1; + } else { + while (minIndex - 1 >= searchIndex && + stops[minIndex - 1].can_merge(stops[minIndex])) { + minIndex--; + } + intercept = minIndex; + } + // Convert from a table index into units of radius from the center of the + // gradient. + intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius; + } else { + // If outside the outer radius of the gradient table, then use the last + // stop. Set the intercept to advance toward the valid part of the + // gradient table if going in, or just run to the end of the span if going + // away from the gradient. 
+ minIndex = maxIndex; + if (t < middleT) { + intercept = radius + 1; + } + } + // Solve the quadratic for t to find where the merged gradient ends. If no + // intercept is found, just go to the middle or end of the span. + float endT = t >= middleT ? span : min(span, int(middleT)); + if (intercept >= 0) { + float b = middleB + intercept * intercept * invDelta; + if (b > 0) { + b = fastSqrt(b); + endT = min(endT, t >= middleT ? middleT + b : middleT - b); + } else { + // Due to the imprecision of fastSqrt in offset calculations, solving + // the quadratic may fail. However, if the discriminant is still close + // to 0, then just assume it is 0. + endT = min(endT, middleT); + } + } + // Figure out how many chunks are actually inside the merged gradient. + if (t + 4.0f <= endT) { + int inside = int(endT - t) & ~3; + // Convert start and end colors to BGRA and scale to 0..255 range later. + auto minColorF = stops[minIndex].startColor.zyxw * 255.0f; + auto maxColorF = stops[maxIndex].end_color().zyxw * 255.0f; + // Compute the change in color per change in gradient offset. + auto deltaColorF = + (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex)); + // Subtract off the color difference of the beginning of the current span + // from the beginning of the gradient. + Float colorF = + minColorF - deltaColorF * (startRadius + (minIndex - 1) / size); + // Finally, walk over the span accumulating the position dot product and + // getting its sqrt as an offset into the color ramp. Since we're already + // in BGRA format and scaled to 255, we just need to round to an integer + // and pack down to pixel format. 
+ for (auto* end = buf + inside; buf < end; buf += 4) { + Float offsetG = fastSqrt(dotPos); + commit_blend_span( + buf, + combine( + packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1), + round_pixel(colorF + deltaColorF * offsetG.y, 1)), + packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1), + round_pixel(colorF + deltaColorF * offsetG.w, 1)))); + dotPos += dotPosDelta; + dotPosDelta += deltaDelta2; + } + // Advance past the portion of gradient we just processed. + t += inside; + // If we hit the end of the span, exit out now. + if (t >= span) { + break; + } + // Otherwise, we are most likely in a transitional section of the gradient + // between stops that will likely require doing per-sample table lookups. + // Rather than having to redo all the searching above to figure that out, + // just assume that to be the case and fall through below to doing the + // table lookups to hopefully avoid an iteration. + offset = fastSqrt(dotPos) - radius; + if (repeat) { + offset = fract(offset); + } + } + // If we got here, that means we still have span left to process but did not + // have any whole chunks that fell within a merged gradient. Just fall back + // to doing a table lookup for each sample. + Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); + commit_blend_span(buf, sampleGradient(sampler, address, entry)); + buf += 4; + t += 4; + dotPos += dotPosDelta; + dotPosDelta += deltaDelta2; + } + return true; +} + +// Commits an entire span of a radial gradient similar to +// swglcommitLinearGradient, but given a varying 2D position scaled to +// gradient-space and a radius at which the distance from the origin maps to the +// start of the gradient table. 
+#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \ + radius) \ + do { \ + bool drawn = false; \ + if (blend_key) { \ + drawn = \ + commitRadialGradient(sampler, address, size, repeat, pos, \ + radius, swgl_OutRGBA8, swgl_SpanLength); \ + } else { \ + drawn = \ + commitRadialGradient(sampler, address, size, repeat, pos, \ + radius, swgl_OutRGBA8, swgl_SpanLength); \ + } \ + if (drawn) { \ + swgl_OutRGBA8 += swgl_SpanLength; \ + swgl_SpanLength = 0; \ + } \ + } while (0) + +// Extension to set a clip mask image to be sampled during blending. The offset +// specifies the positioning of the clip mask image relative to the viewport +// origin. The bounding box specifies the rectangle relative to the clip mask's +// origin that constrains sampling within the clip mask. Blending must be +// enabled for this to work. +static sampler2D swgl_ClipMask = nullptr; +static IntPoint swgl_ClipMaskOffset = {0, 0}; +static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0}; +#define swgl_clipMask(mask, offset, bb_origin, bb_size) \ + do { \ + if (bb_size != vec2_scalar(0.0f, 0.0f)) { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK; \ + swgl_ClipMask = mask; \ + swgl_ClipMaskOffset = make_ivec2(offset); \ + swgl_ClipMaskBounds = \ + IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \ + } \ + } while (0) + +// Extension to enable anti-aliasing for the given edges of a quad. +// Blending must be enable for this to work. +static int swgl_AAEdgeMask = 0; + +static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; } +static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; } +static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) { + return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) | + (mask.w ? 
8 : 0); +} + +#define swgl_antiAlias(edges) \ + do { \ + swgl_AAEdgeMask = calcAAEdgeMask(edges); \ + if (swgl_AAEdgeMask) { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_AA; \ + } \ + } while (0) + +#define swgl_blendDropShadow(color) \ + do { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ + swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \ + swgl_BlendColorRGBA8 = packColor(color); \ + } while (0) + +#define swgl_blendSubpixelText(color) \ + do { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ + swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \ + swgl_BlendColorRGBA8 = packColor(color); \ + swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8); \ + } while (0) + +// Dispatch helper used by the GLSL translator to swgl_drawSpan functions. +// The number of pixels committed is tracked by checking for the difference in +// swgl_SpanLength. Any varying interpolants used will be advanced past the +// committed part of the span in case the fragment shader must be executed for +// any remaining pixels that were not committed by the span shader. +#define DISPATCH_DRAW_SPAN(self, format) \ + do { \ + int total = self->swgl_SpanLength; \ + self->swgl_drawSpan##format(); \ + int drawn = total - self->swgl_SpanLength; \ + if (drawn) self->step_interp_inputs(drawn); \ + return drawn; \ + } while (0) diff --git a/gfx/wr/swgl/src/swgl_fns.rs b/gfx/wr/swgl/src/swgl_fns.rs new file mode 100644 index 0000000000..79669bd205 --- /dev/null +++ b/gfx/wr/swgl/src/swgl_fns.rs @@ -0,0 +1,2489 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#![allow(unused_variables)] + +use gleam::gl::*; +use std::ffi::{CStr, CString}; +use std::os::raw::{c_char, c_int, c_void}; +use std::ptr; +use std::str; + +#[allow(unused)] +macro_rules! 
debug { + ($($x:tt)*) => {}; +} + +#[repr(C)] +struct LockedTexture { + _private: [u8; 0], +} + +#[allow(dead_code)] +extern "C" { + fn ActiveTexture(texture: GLenum); + fn BindTexture(target: GLenum, texture: GLuint); + fn BindBuffer(target: GLenum, buffer: GLuint); + fn BindVertexArray(vao: GLuint); + fn BindFramebuffer(target: GLenum, fb: GLuint); + fn BindRenderbuffer(target: GLenum, rb: GLuint); + fn BlendFunc(srgb: GLenum, drgb: GLenum, sa: GLenum, da: GLenum); + fn BlendColor(r: GLfloat, g: GLfloat, b: GLfloat, a: GLfloat); + fn BlendEquation(mode: GLenum); + fn Enable(cap: GLenum); + fn Disable(cap: GLenum); + fn GenQueries(n: GLsizei, result: *mut GLuint); + fn BeginQuery(target: GLenum, id: GLuint); + fn EndQuery(target: GLenum); + fn GetQueryObjectui64v(id: GLuint, pname: GLenum, params: *mut GLuint64); + fn GenBuffers(n: i32, result: *mut GLuint); + fn GenTextures(n: i32, result: *mut GLuint); + fn GenFramebuffers(n: i32, result: *mut GLuint); + fn GenRenderbuffers(n: i32, result: *mut GLuint); + fn BufferData(target: GLenum, size: GLsizeiptr, data: *const GLvoid, usage: GLenum); + fn BufferSubData(target: GLenum, offset: GLintptr, size: GLsizeiptr, data: *const GLvoid); + fn MapBuffer(target: GLenum, access: GLbitfield) -> *mut c_void; + fn MapBufferRange( + target: GLenum, + offset: GLintptr, + length: GLsizeiptr, + access: GLbitfield, + ) -> *mut c_void; + fn UnmapBuffer(target: GLenum) -> GLboolean; + fn TexStorage2D( + target: GLenum, + levels: GLint, + internal_format: GLenum, + width: GLsizei, + height: GLsizei, + ); + fn FramebufferTexture2D( + target: GLenum, + attachment: GLenum, + textarget: GLenum, + texture: GLuint, + level: GLint, + ); + fn CheckFramebufferStatus(target: GLenum) -> GLenum; + fn InvalidateFramebuffer(target: GLenum, num_attachments: GLsizei, attachments: *const GLenum); + fn TexImage2D( + target: GLenum, + level: GLint, + internal_format: GLint, + width: GLsizei, + height: GLsizei, + border: GLint, + format: GLenum, + ty: 
GLenum, + data: *const c_void, + ); + fn TexSubImage2D( + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + width: GLsizei, + height: GLsizei, + format: GLenum, + ty: GLenum, + data: *const c_void, + ); + fn GenerateMipmap(target: GLenum); + fn GetUniformLocation(program: GLuint, name: *const GLchar) -> GLint; + fn BindAttribLocation(program: GLuint, index: GLuint, name: *const GLchar); + fn GetAttribLocation(program: GLuint, name: *const GLchar) -> GLint; + fn GenVertexArrays(n: i32, result: *mut GLuint); + fn VertexAttribPointer( + index: GLuint, + size: GLint, + type_: GLenum, + normalized: GLboolean, + stride: GLsizei, + offset: *const GLvoid, + ); + fn VertexAttribIPointer( + index: GLuint, + size: GLint, + type_: GLenum, + stride: GLsizei, + offset: *const GLvoid, + ); + fn CreateShader(shader_type: GLenum) -> GLuint; + fn AttachShader(program: GLuint, shader: GLuint); + fn CreateProgram() -> GLuint; + fn Uniform1i(location: GLint, v0: GLint); + fn Uniform4fv(location: GLint, count: GLsizei, value: *const GLfloat); + fn UniformMatrix4fv( + location: GLint, + count: GLsizei, + transpose: GLboolean, + value: *const GLfloat, + ); + fn DrawElementsInstanced( + mode: GLenum, + count: GLsizei, + type_: GLenum, + indices: GLintptr, + instancecount: GLsizei, + ); + fn EnableVertexAttribArray(index: GLuint); + fn VertexAttribDivisor(index: GLuint, divisor: GLuint); + fn LinkProgram(program: GLuint); + fn GetLinkStatus(program: GLuint) -> GLint; + fn UseProgram(program: GLuint); + fn SetViewport(x: GLint, y: GLint, width: GLsizei, height: GLsizei); + fn FramebufferRenderbuffer( + target: GLenum, + attachment: GLenum, + renderbuffertarget: GLenum, + renderbuffer: GLuint, + ); + fn RenderbufferStorage(target: GLenum, internalformat: GLenum, width: GLsizei, height: GLsizei); + fn DepthMask(flag: GLboolean); + fn DepthFunc(func: GLenum); + fn SetScissor(x: GLint, y: GLint, width: GLsizei, height: GLsizei); + fn ClearColor(r: GLfloat, g: GLfloat, b: 
GLfloat, a: GLfloat); + fn ClearDepth(depth: GLdouble); + fn Clear(mask: GLbitfield); + fn ClearTexSubImage( + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + format: GLenum, + ty: GLenum, + data: *const c_void, + ); + fn ClearTexImage(target: GLenum, level: GLint, format: GLenum, ty: GLenum, data: *const c_void); + fn ClearColorRect( + fbo: GLuint, + xoffset: GLint, + yoffset: GLint, + width: GLsizei, + height: GLsizei, + r: GLfloat, + g: GLfloat, + b: GLfloat, + a: GLfloat, + ); + fn PixelStorei(name: GLenum, param: GLint); + fn ReadPixels( + x: GLint, + y: GLint, + width: GLsizei, + height: GLsizei, + format: GLenum, + ty: GLenum, + data: *mut c_void, + ); + fn Finish(); + fn ShaderSourceByName(shader: GLuint, name: *const GLchar); + fn TexParameteri(target: GLenum, pname: GLenum, param: GLint); + fn CopyImageSubData( + src_name: GLuint, + src_target: GLenum, + src_level: GLint, + src_x: GLint, + src_y: GLint, + src_z: GLint, + dst_name: GLuint, + dst_target: GLenum, + dst_level: GLint, + dst_x: GLint, + dst_y: GLint, + dst_z: GLint, + src_width: GLsizei, + src_height: GLsizei, + src_depth: GLsizei, + ); + fn CopyTexSubImage2D( + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + x: GLint, + y: GLint, + width: GLsizei, + height: GLsizei, + ); + fn BlitFramebuffer( + src_x0: GLint, + src_y0: GLint, + src_x1: GLint, + src_y1: GLint, + dst_x0: GLint, + dst_y0: GLint, + dst_x1: GLint, + dst_y1: GLint, + mask: GLbitfield, + filter: GLenum, + ); + fn GetIntegerv(pname: GLenum, params: *mut GLint); + fn GetBooleanv(pname: GLenum, params: *mut GLboolean); + fn GetString(name: GLenum) -> *const c_char; + fn GetStringi(name: GLenum, index: GLuint) -> *const c_char; + fn GetError() -> GLenum; + fn InitDefaultFramebuffer( + x: i32, + y: i32, + width: i32, + height: i32, + stride: i32, + buf: *mut c_void, + ); + fn GetColorBuffer( + fbo: GLuint, + flush: 
GLboolean, + width: *mut i32, + height: *mut i32, + stride: *mut i32, + ) -> *mut c_void; + fn ResolveFramebuffer(fbo: GLuint); + fn SetTextureBuffer( + tex: GLuint, + internal_format: GLenum, + width: GLsizei, + height: GLsizei, + stride: GLsizei, + buf: *mut c_void, + min_width: GLsizei, + min_height: GLsizei, + ); + fn SetTextureParameter(tex: GLuint, pname: GLenum, param: GLint); + fn DeleteTexture(n: GLuint); + fn DeleteRenderbuffer(n: GLuint); + fn DeleteFramebuffer(n: GLuint); + fn DeleteBuffer(n: GLuint); + fn DeleteVertexArray(n: GLuint); + fn DeleteQuery(n: GLuint); + fn DeleteShader(shader: GLuint); + fn DeleteProgram(program: GLuint); + fn LockFramebuffer(fbo: GLuint) -> *mut LockedTexture; + fn LockTexture(tex: GLuint) -> *mut LockedTexture; + fn LockResource(resource: *mut LockedTexture); + fn UnlockResource(resource: *mut LockedTexture); + fn GetResourceBuffer( + resource: *mut LockedTexture, + width: *mut i32, + height: *mut i32, + stride: *mut i32, + ) -> *mut c_void; + fn Composite( + locked_dst: *mut LockedTexture, + locked_src: *mut LockedTexture, + src_x: GLint, + src_y: GLint, + src_width: GLsizei, + src_height: GLsizei, + dst_x: GLint, + dst_y: GLint, + dst_width: GLsizei, + dst_height: GLsizei, + opaque: GLboolean, + flip_x: GLboolean, + flip_y: GLboolean, + filter: GLenum, + clip_x: GLint, + clip_y: GLint, + clip_width: GLsizei, + clip_height: GLsizei, + ); + fn CompositeYUV( + locked_dst: *mut LockedTexture, + locked_y: *mut LockedTexture, + locked_u: *mut LockedTexture, + locked_v: *mut LockedTexture, + color_space: YuvRangedColorSpace, + color_depth: GLuint, + src_x: GLint, + src_y: GLint, + src_width: GLsizei, + src_height: GLsizei, + dst_x: GLint, + dst_y: GLint, + dst_width: GLsizei, + dst_height: GLsizei, + flip_x: GLboolean, + flip_y: GLboolean, + clip_x: GLint, + clip_y: GLint, + clip_width: GLsizei, + clip_height: GLsizei, + ); + fn CreateContext() -> *mut c_void; + fn ReferenceContext(ctx: *mut c_void); + fn DestroyContext(ctx: 
*mut c_void);
    fn MakeCurrent(ctx: *mut c_void);
    fn ReportMemory(
        ctx: *mut c_void,
        size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize,
    ) -> usize;
}

/// Copyable handle around the raw context pointer returned by `CreateContext`.
///
/// Copying the handle only copies the pointer: it does not call
/// `ReferenceContext`, and nothing here calls `DestroyContext` automatically.
#[derive(Clone, Copy)]
pub struct Context(*mut c_void);

impl Context {
    /// Creates a context through `CreateContext` and wraps the returned pointer.
    pub fn create() -> Self {
        let raw = unsafe { CreateContext() };
        Context(raw)
    }

    /// Passes this context's pointer to `ReferenceContext`.
    pub fn reference(&self) {
        unsafe { ReferenceContext(self.0) };
    }

    /// Passes this context's pointer to `DestroyContext`.
    pub fn destroy(&self) {
        unsafe { DestroyContext(self.0) };
    }

    /// Passes this context's pointer to `MakeCurrent`.
    pub fn make_current(&self) {
        unsafe { MakeCurrent(self.0) };
    }

    /// Forwards to `InitDefaultFramebuffer`. The call does not receive this
    /// handle's pointer.
    pub fn init_default_framebuffer(
        &self,
        x: i32,
        y: i32,
        width: i32,
        height: i32,
        stride: i32,
        buf: *mut c_void,
    ) {
        unsafe { InitDefaultFramebuffer(x, y, width, height, stride, buf) };
    }

    /// Queries `GetColorBuffer` for `fbo` and returns
    /// `(data pointer, width, height, stride)`.
    pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32, i32) {
        let (mut width, mut height, mut stride) = (0i32, 0i32, 0i32);
        let data_ptr = unsafe {
            GetColorBuffer(fbo, flush as GLboolean, &mut width, &mut height, &mut stride)
        };
        (data_ptr, width, height, stride)
    }

    /// Forwards to `ResolveFramebuffer`.
    pub fn resolve_framebuffer(&self, fbo: GLuint) {
        unsafe { ResolveFramebuffer(fbo) };
    }

    /// Forwards to `ClearColorRect` with the given rectangle and colour.
    pub fn clear_color_rect(
        &self,
        fbo: GLuint,
        xoffset: GLint,
        yoffset: GLint,
        width: GLsizei,
        height: GLsizei,
        r: f32,
        g: f32,
        b: f32,
        a: f32,
    ) {
        unsafe { ClearColorRect(fbo, xoffset, yoffset, width, height, r, g, b, a) };
    }

    /// Forwards to `SetTextureBuffer`.
    pub fn set_texture_buffer(
        &self,
        tex: GLuint,
        internal_format: GLenum,
        width: GLsizei,
        height: GLsizei,
        stride: GLsizei,
        buf: *mut c_void,
        min_width: GLsizei,
        min_height: GLsizei,
    ) {
        unsafe {
            SetTextureBuffer(tex, internal_format, width, height, stride, buf, min_width, min_height)
        };
    }

    /// Forwards to `SetTextureParameter`.
    pub fn set_texture_parameter(&self, tex: GLuint, pname: GLenum, param: GLint) {
        unsafe { SetTextureParameter(tex, pname, param) };
    }

    /// Calls `LockFramebuffer`; returns `None` when it yields a null pointer.
    pub fn lock_framebuffer(&self, fbo: GLuint) -> Option<LockedResource> {
        let resource = unsafe { LockFramebuffer(fbo) };
        if resource.is_null() {
            None
        } else {
            Some(LockedResource(resource))
        }
    }

    /// Calls `LockTexture`; returns `None` when it yields a null pointer.
    pub fn lock_texture(&self, tex: GLuint) -> Option<LockedResource> {
        let resource = unsafe { LockTexture(tex) };
        if resource.is_null() {
            None
        } else {
            Some(LockedResource(resource))
        }
    }

    /// Forwards to `ReportMemory` with this context's pointer and the supplied
    /// size-of callback, returning its result.
    pub fn report_memory(
        &self,
        size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize,
    ) -> usize {
        unsafe { ReportMemory(self.0, size_of_op) }
    }
}

impl From<*mut c_void> for Context {
    /// Wraps a raw pointer without checking it.
    fn from(ptr: *mut c_void) -> Self {
        Context(ptr)
    }
}

impl From<Context> for *mut c_void {
    /// Unwraps the raw pointer.
    fn from(ctx: Context) -> Self {
        ctx.0
    }
}

/// Returns the byte count of a `width` x `height` pixel rectangle in the given
/// `format` / `pixel_type`.
///
/// Panics on an unsupported `format` or `pixel_type`, on a negative dimension,
/// and when the byte count does not fit in `usize`. The product is computed in
/// `usize` with overflow checks; multiplying in `GLsizei` can overflow for
/// large rectangles and turns negative products into huge lengths.
fn calculate_length(width: GLsizei, height: GLsizei, format: GLenum, pixel_type: GLenum) -> usize {
    let colors: usize = match format {
        RED => 1,
        RGB => 3,
        BGR => 3,

        RGBA => 4,
        BGRA => 4,

        ALPHA => 1,
        R16 => 1,
        LUMINANCE => 1,
        DEPTH_COMPONENT => 1,
        _ => panic!("unsupported format for read_pixels: {:?}", format),
    };
    let depth: usize = match pixel_type {
        UNSIGNED_BYTE => 1,
        UNSIGNED_SHORT => 2,
        SHORT => 2,
        FLOAT => 4,
        UNSIGNED_INT_8_8_8_8_REV => 1,
        _ => panic!("unsupported pixel_type for read_pixels: {:?}", pixel_type),
    };

    assert!(
        width >= 0 && height >= 0,
        "negative dimensions for read_pixels: {}x{}",
        width,
        height
    );
    (width as usize)
        .checked_mul(height as usize)
        .and_then(|pixels| pixels.checked_mul(colors * depth))
        .expect("pixel buffer size for read_pixels overflows usize")
}

impl Gl for Context {
    /// Always reports `GlType::Gl`.
    fn get_type(&self) -> GlType {
        GlType::Gl
    }

    /// Logs the arguments and forwards to `BufferData`.
    fn buffer_data_untyped(
        &self,
        target: GLenum,
        size: GLsizeiptr,
        data: *const GLvoid,
        usage: GLenum,
    ) {
        debug!(
            "buffer_data_untyped {} {} {:?} {}",
            target, size, data, usage
        );
        unsafe { BufferData(target, size, data, usage) };
    }

    /// Logs the arguments and forwards to `BufferSubData`.
    fn buffer_sub_data_untyped(
        &self,
        target: GLenum,
        offset: isize,
        size: GLsizeiptr,
        data: *const GLvoid,
    ) {
        debug!(
            "buffer_sub_data_untyped {} {} {} {:?}",
            target, offset, size, data
        );
        unsafe { BufferSubData(target, offset, size, data) };
    }

    fn map_buffer(&self, target: GLenum, access:
GLbitfield) -> *mut c_void {
        unsafe { MapBuffer(target, access) }
    }

    /// Forwards to `MapBufferRange` and returns its pointer.
    fn map_buffer_range(
        &self,
        target: GLenum,
        offset: GLintptr,
        length: GLsizeiptr,
        access: GLbitfield,
    ) -> *mut c_void {
        unsafe { MapBufferRange(target, offset, length, access) }
    }

    /// Forwards to `UnmapBuffer` and returns its result.
    fn unmap_buffer(&self, target: GLenum) -> GLboolean {
        unsafe { UnmapBuffer(target) }
    }

    /// Selects the shader by name instead of by source text.
    ///
    /// Scans `strings` for the first one that contains a `// shader: <name>`
    /// marker followed by a newline, and passes the trimmed name to
    /// `ShaderSourceByName`. Panics if a string is not valid UTF-8, if the
    /// name contains a NUL byte, or if no string carries the marker.
    fn shader_source(&self, shader: GLuint, strings: &[&[u8]]) {
        debug!("shader_source {}", shader);
        const PREFIX: &'static str = "// shader: ";
        for s in strings {
            let u = str::from_utf8(s).unwrap();
            if let Some(start) = u.find(PREFIX) {
                // `end` is relative to `start`, hence `start + end` below.
                if let Some(end) = u[start..].find('\n') {
                    let name = u[start + PREFIX.len()..start + end].trim();
                    debug!("shader name: {}", name);
                    let c_string = CString::new(name).unwrap();
                    unsafe {
                        ShaderSourceByName(shader, c_string.as_ptr());
                    }
                    return;
                }
            }
        }
        panic!("unknown shader");
    }

    /// Not implemented; panics if called.
    fn tex_buffer(&self, target: GLenum, internal_format: GLenum, buffer: GLuint) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn read_buffer(&self, mode: GLenum) {
        panic!();
    }

    /// Reads pixels into `dst_buffer` through `ReadPixels`.
    ///
    /// Panics unless `dst_buffer.len()` equals the length computed by
    /// `calculate_length` for the requested rectangle and formats.
    fn read_pixels_into_buffer(
        &self,
        x: GLint,
        y: GLint,
        width: GLsizei,
        height: GLsizei,
        format: GLenum,
        pixel_type: GLenum,
        dst_buffer: &mut [u8],
    ) {
        // Assumes that the user properly allocated the size for dst_buffer.
        assert!(calculate_length(width, height, format, pixel_type) == dst_buffer.len());

        unsafe {
            ReadPixels(
                x,
                y,
                width,
                height,
                format,
                pixel_type,
                dst_buffer.as_mut_ptr() as *mut c_void,
            );
        }
    }

    /// Allocates a buffer of the required size and fills it via
    /// `read_pixels_into_buffer`.
    fn read_pixels(
        &self,
        x: GLint,
        y: GLint,
        width: GLsizei,
        height: GLsizei,
        format: GLenum,
        pixel_type: GLenum,
    ) -> Vec<u8> {
        let len = calculate_length(width, height, format, pixel_type);
        // Zero-initialise rather than `reserve` + `set_len`: handing out a
        // `&mut [u8]` over uninitialised capacity is undefined behaviour, and
        // any bytes `ReadPixels` leaves unwritten would otherwise leak.
        let mut pixels = vec![0u8; len];

        self.read_pixels_into_buffer(
            x,
            y,
            width,
            height,
            format,
            pixel_type,
            pixels.as_mut_slice(),
        );

        pixels
    }

    /// Calls `ReadPixels` with a null destination pointer.
    unsafe fn read_pixels_into_pbo(
        &self,
        x: GLint,
        y: GLint,
        width: GLsizei,
        height: GLsizei,
        format: GLenum,
        pixel_type: GLenum,
    ) {
        ReadPixels(x, y, width, height, format, pixel_type, ptr::null_mut());
    }

    /// Not implemented; panics if called.
    fn sample_coverage(&self, value: GLclampf, invert: bool) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn polygon_offset(&self, factor: GLfloat, units: GLfloat) {
        panic!();
    }

    /// Logs the arguments and forwards to `PixelStorei`.
    fn pixel_store_i(&self, name: GLenum, param: GLint) {
        debug!("pixel_store_i {:x} {}", name, param);
        unsafe { PixelStorei(name, param) };
    }

    /// Returns `n` names filled in by `GenBuffers`.
    fn gen_buffers(&self, n: GLsizei) -> Vec<GLuint> {
        let mut result: Vec<GLuint> = vec![0; n as usize];
        unsafe { GenBuffers(n, result.as_mut_ptr()) };
        result
    }

    /// Returns `n` names filled in by `GenRenderbuffers`.
    fn gen_renderbuffers(&self, n: GLsizei) -> Vec<GLuint> {
        debug!("gen_renderbuffers {}", n);
        let mut result: Vec<GLuint> = vec![0; n as usize];
        unsafe { GenRenderbuffers(n, result.as_mut_ptr()) };
        result
    }

    /// Returns `n` names filled in by `GenFramebuffers`.
    fn gen_framebuffers(&self, n: GLsizei) -> Vec<GLuint> {
        debug!("gen_framebuffers {}", n);
        let mut result: Vec<GLuint> = vec![0; n as usize];
        unsafe { GenFramebuffers(n, result.as_mut_ptr()) };
        result
    }

    /// Returns `n` names filled in by `GenTextures`.
    fn gen_textures(&self, n: GLsizei) -> Vec<GLuint> {
        let mut result = vec![0 as GLuint; n as usize];
        unsafe {
            GenTextures(n, result.as_mut_ptr());
}
        result
    }

    /// Returns `n` names filled in by `GenVertexArrays`.
    fn gen_vertex_arrays(&self, n: GLsizei) -> Vec<GLuint> {
        let mut result: Vec<GLuint> = vec![0; n as usize];
        unsafe { GenVertexArrays(n, result.as_mut_ptr()) };
        result
    }

    /// Alias for `gen_vertex_arrays`.
    fn gen_vertex_arrays_apple(&self, n: GLsizei) -> Vec<GLuint> {
        self.gen_vertex_arrays(n)
    }

    /// Returns `n` names filled in by `GenQueries`.
    fn gen_queries(&self, n: GLsizei) -> Vec<GLuint> {
        let mut result: Vec<GLuint> = vec![0; n as usize];
        unsafe { GenQueries(n, result.as_mut_ptr()) };
        result
    }

    /// Forwards to `BeginQuery`.
    fn begin_query(&self, target: GLenum, id: GLuint) {
        unsafe { BeginQuery(target, id) };
    }

    /// Forwards to `EndQuery`.
    fn end_query(&self, target: GLenum) {
        unsafe { EndQuery(target) };
    }

    /// Not implemented; panics if called.
    fn query_counter(&self, id: GLuint, target: GLenum) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn get_query_object_iv(&self, id: GLuint, pname: GLenum) -> i32 {
        panic!();
    }

    /// Not implemented; panics if called.
    fn get_query_object_uiv(&self, id: GLuint, pname: GLenum) -> u32 {
        panic!();
    }

    /// Not implemented; panics if called.
    fn get_query_object_i64v(&self, id: GLuint, pname: GLenum) -> i64 {
        panic!();
    }

    /// Returns the value written by `GetQueryObjectui64v` (0 if it writes nothing).
    fn get_query_object_ui64v(&self, id: GLuint, pname: GLenum) -> u64 {
        let mut result = 0;
        unsafe { GetQueryObjectui64v(id, pname, &mut result) };
        result
    }

    /// Calls `DeleteQuery` once per name.
    fn delete_queries(&self, queries: &[GLuint]) {
        for &q in queries {
            unsafe { DeleteQuery(q) };
        }
    }

    /// Calls `DeleteVertexArray` once per name.
    fn delete_vertex_arrays(&self, vertex_arrays: &[GLuint]) {
        for &v in vertex_arrays {
            unsafe { DeleteVertexArray(v) };
        }
    }

    /// Alias for `delete_vertex_arrays`.
    fn delete_vertex_arrays_apple(&self, vertex_arrays: &[GLuint]) {
        self.delete_vertex_arrays(vertex_arrays)
    }

    /// Calls `DeleteBuffer` once per name.
    fn delete_buffers(&self, buffers: &[GLuint]) {
        for &b in buffers {
            unsafe { DeleteBuffer(b) };
        }
    }

    /// Calls `DeleteRenderbuffer` once per name.
    fn delete_renderbuffers(&self, renderbuffers: &[GLuint]) {
        for &r in renderbuffers {
            unsafe { DeleteRenderbuffer(r) };
        }
    }

    /// Calls `DeleteFramebuffer` once per name.
    fn delete_framebuffers(&self, framebuffers: &[GLuint]) {
        for &f in framebuffers {
            unsafe { DeleteFramebuffer(f) };
        }
    }

    /// Calls `DeleteTexture` once per name.
    fn delete_textures(&self, textures: &[GLuint]) {
        for &t in textures {
            unsafe { DeleteTexture(t) };
        }
    }

    /// Logs the arguments and forwards to `FramebufferRenderbuffer`.
    fn framebuffer_renderbuffer(
        &self,
        target: GLenum,
        attachment: GLenum,
        renderbuffertarget: GLenum,
        renderbuffer: GLuint,
    ) {
        debug!(
            "framebuffer_renderbuffer {} {} {} {}",
            target, attachment, renderbuffertarget, renderbuffer
        );
        unsafe {
            FramebufferRenderbuffer(target, attachment, renderbuffertarget, renderbuffer);
        }
    }

    /// Logs the arguments and forwards to `RenderbufferStorage`.
    fn renderbuffer_storage(
        &self,
        target: GLenum,
        internalformat: GLenum,
        width: GLsizei,
        height: GLsizei,
    ) {
        debug!(
            "renderbuffer_storage {} {} {} {}",
            target, internalformat, width, height
        );
        unsafe { RenderbufferStorage(target, internalformat, width, height) };
    }

    /// Logs the argument and forwards to `DepthFunc`.
    fn depth_func(&self, func: GLenum) {
        debug!("depth_func {}", func);
        unsafe { DepthFunc(func) };
    }

    /// Forwards to `ActiveTexture`.
    fn active_texture(&self, texture: GLenum) {
        unsafe { ActiveTexture(texture) };
    }

    /// Logs the arguments and forwards to `AttachShader`.
    fn attach_shader(&self, program: GLuint, shader: GLuint) {
        debug!("attach shader {} {}", program, shader);
        unsafe { AttachShader(program, shader) };
    }

    /// Forwards to `BindAttribLocation`. Panics if `name` contains a NUL byte.
    fn bind_attrib_location(&self, program: GLuint, index: GLuint, name: &str) {
        debug!("bind_attrib_location {} {} {}", program, index, name);
        let c_string = CString::new(name).unwrap();
        unsafe { BindAttribLocation(program, index, c_string.as_ptr()) }
    }

    // https://www.khronos.org/registry/OpenGL-Refpages/es2.0/xhtml/glGetUniform.xml
    /// Not implemented; panics if called.
    unsafe fn get_uniform_iv(&self, program: GLuint, location: GLint, result: &mut [GLint]) {
        panic!();
    }

    // https://www.khronos.org/registry/OpenGL-Refpages/es2.0/xhtml/glGetUniform.xml
    /// Not implemented; panics if called.
    unsafe fn get_uniform_fv(&self, program: GLuint, location: GLint, result: &mut [GLfloat]) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn get_uniform_block_index(&self, program: GLuint, name: &str) -> GLuint {
        panic!();
    }

    fn
get_uniform_indices(&self, program: GLuint, names: &[&str]) -> Vec { + panic!(); + //Vec::new() + } + + fn bind_buffer_base(&self, target: GLenum, index: GLuint, buffer: GLuint) { + panic!(); + } + + fn bind_buffer_range( + &self, + target: GLenum, + index: GLuint, + buffer: GLuint, + offset: GLintptr, + size: GLsizeiptr, + ) { + panic!(); + } + + fn uniform_block_binding( + &self, + program: GLuint, + uniform_block_index: GLuint, + uniform_block_binding: GLuint, + ) { + panic!(); + } + + fn bind_buffer(&self, target: GLenum, buffer: GLuint) { + //panic!(); + unsafe { + BindBuffer(target, buffer); + } + } + + fn bind_vertex_array(&self, vao: GLuint) { + //panic!(); + unsafe { + BindVertexArray(vao); + } + } + + fn bind_vertex_array_apple(&self, vao: GLuint) { + self.bind_vertex_array(vao) + } + + fn bind_renderbuffer(&self, target: GLenum, renderbuffer: GLuint) { + debug!("bind_renderbuffer {} {}", target, renderbuffer); + //panic!(); + unsafe { + BindRenderbuffer(target, renderbuffer); + } + } + + fn bind_framebuffer(&self, target: GLenum, framebuffer: GLuint) { + debug!("bind_framebuffer {} {}", target, framebuffer); + //panic!(); + unsafe { + BindFramebuffer(target, framebuffer); + } + } + + fn bind_vertex_buffer( + &self, + binding_index: GLuint, + buffer: GLuint, + offset: GLintptr, + stride: GLint, + ) { + unimplemented!("Not supported by SWGL"); + } + + fn bind_texture(&self, target: GLenum, texture: GLuint) { + //panic!(); + unsafe { + BindTexture(target, texture); + } + } + + fn draw_buffers(&self, bufs: &[GLenum]) { + panic!(); + //unsafe {} + } + + // FIXME: Does not verify buffer size -- unsafe! 
+ fn tex_image_2d( + &self, + target: GLenum, + level: GLint, + internal_format: GLint, + width: GLsizei, + height: GLsizei, + border: GLint, + format: GLenum, + ty: GLenum, + opt_data: Option<&[u8]>, + ) { + unsafe { + let pdata = match opt_data { + Some(data) => data.as_ptr() as *const GLvoid, + None => ptr::null(), + }; + TexImage2D( + target, + level, + internal_format, + width, + height, + border, + format, + ty, + pdata, + ); + } + } + + fn compressed_tex_image_2d( + &self, + target: GLenum, + level: GLint, + internal_format: GLenum, + width: GLsizei, + height: GLsizei, + border: GLint, + data: &[u8], + ) { + panic!(); + } + + fn compressed_tex_sub_image_2d( + &self, + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + width: GLsizei, + height: GLsizei, + format: GLenum, + data: &[u8], + ) { + panic!(); + } + + fn tex_image_3d( + &self, + target: GLenum, + level: GLint, + internal_format: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + border: GLint, + format: GLenum, + ty: GLenum, + opt_data: Option<&[u8]>, + ) { + panic!(); + } + + fn copy_tex_image_2d( + &self, + target: GLenum, + level: GLint, + internal_format: GLenum, + x: GLint, + y: GLint, + width: GLsizei, + height: GLsizei, + border: GLint, + ) { + panic!(); + } + + fn copy_tex_sub_image_2d( + &self, + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + x: GLint, + y: GLint, + width: GLsizei, + height: GLsizei, + ) { + unsafe { + CopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height); + } + } + + fn copy_tex_sub_image_3d( + &self, + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + x: GLint, + y: GLint, + width: GLsizei, + height: GLsizei, + ) { + panic!(); + } + + fn tex_sub_image_2d( + &self, + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + width: GLsizei, + height: GLsizei, + format: GLenum, + ty: GLenum, + data: &[u8], + ) { + debug!( + "tex_sub_image_2d {} {} {} {} 
{} {} {} {}", + target, level, xoffset, yoffset, width, height, format, ty + ); + //panic!(); + unsafe { + TexSubImage2D( + target, + level, + xoffset, + yoffset, + width, + height, + format, + ty, + data.as_ptr() as *const c_void, + ); + } + } + + fn tex_sub_image_2d_pbo( + &self, + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + width: GLsizei, + height: GLsizei, + format: GLenum, + ty: GLenum, + offset: usize, + ) { + debug!( + "tex_sub_image_2d_pbo {} {} {} {} {} {} {} {} {}", + target, level, xoffset, yoffset, width, height, format, ty, offset + ); + //panic!(); + unsafe { + TexSubImage2D( + target, + level, + xoffset, + yoffset, + width, + height, + format, + ty, + offset as *const c_void, + ); + } + } + + fn tex_sub_image_3d( + &self, + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + format: GLenum, + ty: GLenum, + data: &[u8], + ) { + debug!("tex_sub_image_3d"); + panic!(); + } + + fn tex_sub_image_3d_pbo( + &self, + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + format: GLenum, + ty: GLenum, + offset: usize, + ) { + panic!(); + } + + fn tex_storage_2d( + &self, + target: GLenum, + levels: GLint, + internal_format: GLenum, + width: GLsizei, + height: GLsizei, + ) { + //panic!(); + unsafe { + TexStorage2D(target, levels, internal_format, width, height); + } + } + + fn tex_storage_3d( + &self, + target: GLenum, + levels: GLint, + internal_format: GLenum, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + ) { + panic!(); + } + + fn get_tex_image_into_buffer( + &self, + target: GLenum, + level: GLint, + format: GLenum, + ty: GLenum, + output: &mut [u8], + ) { + panic!(); + } + + unsafe fn copy_image_sub_data( + &self, + src_name: GLuint, + src_target: GLenum, + src_level: GLint, + src_x: GLint, + src_y: GLint, + src_z: GLint, + dst_name: GLuint, 
+ dst_target: GLenum, + dst_level: GLint, + dst_x: GLint, + dst_y: GLint, + dst_z: GLint, + src_width: GLsizei, + src_height: GLsizei, + src_depth: GLsizei, + ) { + CopyImageSubData( + src_name, src_target, src_level, src_x, src_y, src_z, dst_name, dst_target, dst_level, + dst_x, dst_y, dst_z, src_width, src_height, src_depth, + ); + } + + fn invalidate_framebuffer(&self, target: GLenum, attachments: &[GLenum]) { + unsafe { + InvalidateFramebuffer(target, attachments.len() as GLsizei, attachments.as_ptr()); + } + } + + fn invalidate_sub_framebuffer( + &self, + target: GLenum, + attachments: &[GLenum], + xoffset: GLint, + yoffset: GLint, + width: GLsizei, + height: GLsizei, + ) { + } + + #[inline] + unsafe fn get_integer_v(&self, name: GLenum, result: &mut [GLint]) { + //panic!(); + assert!(!result.is_empty()); + GetIntegerv(name, result.as_mut_ptr()); + } + + #[inline] + unsafe fn get_integer_64v(&self, name: GLenum, result: &mut [GLint64]) { + panic!(); + //assert!(!result.is_empty()); + } + + #[inline] + unsafe fn get_integer_iv(&self, name: GLenum, index: GLuint, result: &mut [GLint]) { + panic!(); + //assert!(!result.is_empty()); + } + + #[inline] + unsafe fn get_integer_64iv(&self, name: GLenum, index: GLuint, result: &mut [GLint64]) { + panic!(); + //assert!(!result.is_empty()); + } + + #[inline] + unsafe fn get_boolean_v(&self, name: GLenum, result: &mut [GLboolean]) { + debug!("get_boolean_v {}", name); + //panic!(); + assert!(!result.is_empty()); + GetBooleanv(name, result.as_mut_ptr()); + } + + #[inline] + unsafe fn get_float_v(&self, name: GLenum, result: &mut [GLfloat]) { + panic!(); + //assert!(!result.is_empty()); + } + + fn get_framebuffer_attachment_parameter_iv( + &self, + target: GLenum, + attachment: GLenum, + pname: GLenum, + ) -> GLint { + panic!(); + //0 + } + + fn get_renderbuffer_parameter_iv(&self, target: GLenum, pname: GLenum) -> GLint { + panic!(); + //0 + } + + fn get_tex_parameter_iv(&self, target: GLenum, pname: GLenum) -> GLint { + 
panic!();
    }

    /// Not implemented; panics if called.
    fn get_tex_parameter_fv(&self, target: GLenum, pname: GLenum) -> GLfloat {
        panic!();
    }

    /// Forwards to `TexParameteri`.
    fn tex_parameter_i(&self, target: GLenum, pname: GLenum, param: GLint) {
        unsafe { TexParameteri(target, pname, param) };
    }

    /// Not implemented; panics if called.
    fn tex_parameter_f(&self, target: GLenum, pname: GLenum, param: GLfloat) {
        panic!();
    }

    /// Logs the arguments and forwards to `FramebufferTexture2D`.
    fn framebuffer_texture_2d(
        &self,
        target: GLenum,
        attachment: GLenum,
        textarget: GLenum,
        texture: GLuint,
        level: GLint,
    ) {
        debug!(
            "framebuffer_texture_2d {} {} {} {} {}",
            target, attachment, textarget, texture, level
        );
        unsafe { FramebufferTexture2D(target, attachment, textarget, texture, level) };
    }

    /// Not implemented; logs the arguments and then panics.
    fn framebuffer_texture_layer(
        &self,
        target: GLenum,
        attachment: GLenum,
        texture: GLuint,
        level: GLint,
        layer: GLint,
    ) {
        debug!(
            "framebuffer_texture_layer {} {} {} {} {}",
            target, attachment, texture, level, layer
        );
        panic!();
    }

    /// Forwards all arguments, in order, to `BlitFramebuffer`.
    fn blit_framebuffer(
        &self,
        src_x0: GLint,
        src_y0: GLint,
        src_x1: GLint,
        src_y1: GLint,
        dst_x0: GLint,
        dst_y0: GLint,
        dst_x1: GLint,
        dst_y1: GLint,
        mask: GLbitfield,
        filter: GLenum,
    ) {
        unsafe {
            BlitFramebuffer(
                src_x0, src_y0, src_x1, src_y1, dst_x0, dst_y0, dst_x1, dst_y1, mask, filter,
            );
        }
    }

    /// Not implemented; panics if called.
    fn vertex_attrib_4f(&self, index: GLuint, x: GLfloat, y: GLfloat, z: GLfloat, w: GLfloat) {
        panic!();
    }

    /// Not supported by SWGL; panics if called.
    fn vertex_attrib_binding(&self, attrib_index: GLuint, binding_index: GLuint) {
        unimplemented!("Not supported by SWGL");
    }

    /// Not implemented; panics if called.
    fn vertex_attrib_pointer_f32(
        &self,
        index: GLuint,
        size: GLint,
        normalized: bool,
        stride: GLsizei,
        offset: GLuint,
    ) {
        panic!();
    }

    /// Logs the arguments and forwards to `VertexAttribPointer`, passing
    /// `offset` cast to a pointer.
    fn vertex_attrib_pointer(
        &self,
        index: GLuint,
        size: GLint,
        type_: GLenum,
        normalized: bool,
        stride: GLsizei,
        offset: GLuint,
    ) {
        debug!(
            "vertex_attrib_pointer {} {} {} {} {} {}",
            index, size, type_, normalized, stride, offset
        );
        unsafe {
            VertexAttribPointer(
                index,
                size,
                type_,
                normalized as GLboolean,
                stride,
                offset as *const GLvoid,
            );
        }
    }

    /// Logs the arguments and forwards to `VertexAttribIPointer`, passing
    /// `offset` cast to a pointer.
    fn vertex_attrib_i_pointer(
        &self,
        index: GLuint,
        size: GLint,
        type_: GLenum,
        stride: GLsizei,
        offset: GLuint,
    ) {
        debug!(
            "vertex_attrib_i_pointer {} {} {} {} {}",
            index, size, type_, stride, offset
        );
        unsafe { VertexAttribIPointer(index, size, type_, stride, offset as *const GLvoid) };
    }

    /// Logs the arguments and forwards to `VertexAttribDivisor`.
    fn vertex_attrib_divisor(&self, index: GLuint, divisor: GLuint) {
        debug!("vertex_attrib_divisor {} {}", index, divisor);
        unsafe { VertexAttribDivisor(index, divisor) };
    }

    /// Not supported by SWGL; panics if called.
    fn vertex_attrib_format(
        &self,
        attrib_index: GLuint,
        size: GLint,
        type_: GLenum,
        normalized: bool,
        relative_offset: GLuint,
    ) {
        unimplemented!("Not supported by SWGL");
    }

    /// Not supported by SWGL; panics if called.
    fn vertex_attrib_i_format(
        &self,
        attrib_index: GLuint,
        size: GLint,
        type_: GLenum,
        relative_offset: GLuint,
    ) {
        unimplemented!("Not supported by SWGL");
    }

    /// Not supported by SWGL; panics if called.
    fn vertex_binding_divisor(&self, binding_index: GLuint, divisor: GLuint) {
        unimplemented!("Not supported by SWGL");
    }

    /// Logs the arguments and forwards to `SetViewport`.
    fn viewport(&self, x: GLint, y: GLint, width: GLsizei, height: GLsizei) {
        debug!("viewport {} {} {} {}", x, y, width, height);
        unsafe { SetViewport(x, y, width, height) };
    }

    /// Forwards to `SetScissor`.
    fn scissor(&self, x: GLint, y: GLint, width: GLsizei, height: GLsizei) {
        unsafe { SetScissor(x, y, width, height) };
    }

    /// Not implemented; panics if called.
    fn line_width(&self, width: GLfloat) {
        panic!();
    }

    /// Forwards to `UseProgram`.
    fn use_program(&self, program: GLuint) {
        unsafe { UseProgram(program) };
    }

    /// Not implemented; panics if called.
    fn validate_program(&self, program: GLuint) {
        panic!();
    }

    /// Issues `DrawElementsInstanced` with element type `NONE`, `first` as the
    /// offset argument and an instance count of 1.
    fn draw_arrays(&self, mode: GLenum, first: GLint, count: GLsizei) {
        unsafe { DrawElementsInstanced(mode, count, NONE, first as GLintptr, 1) };
    }

    /// Issues `DrawElementsInstanced` with element type `NONE` and `first` as
    /// the offset argument.
    fn draw_arrays_instanced(
        &self,
        mode: GLenum,
        first: GLint,
        count: GLsizei,
        primcount: GLsizei,
    ) {
        unsafe { DrawElementsInstanced(mode, count, NONE, first as GLintptr, primcount) };
    }

    /// Logs the arguments and issues `DrawElementsInstanced` with an instance
    /// count of 1.
    fn draw_elements(
        &self,
        mode: GLenum,
        count: GLsizei,
        element_type: GLenum,
        indices_offset: GLuint,
    ) {
        // Four placeholders for four arguments; a fifth `{}` would not compile
        // once `debug!` expands to a real formatting macro.
        debug!(
            "draw_elements {} {} {} {}",
            mode, count, element_type, indices_offset
        );
        unsafe {
            DrawElementsInstanced(mode, count, element_type, indices_offset as GLintptr, 1);
        }
    }

    /// Logs the arguments and forwards to `DrawElementsInstanced`.
    fn draw_elements_instanced(
        &self,
        mode: GLenum,
        count: GLsizei,
        element_type: GLenum,
        indices_offset: GLuint,
        primcount: GLsizei,
    ) {
        debug!(
            "draw_elements_instanced {} {} {} {} {}",
            mode, count, element_type, indices_offset, primcount
        );
        unsafe {
            DrawElementsInstanced(
                mode,
                count,
                element_type,
                indices_offset as GLintptr,
                primcount,
            );
        }
    }

    /// Forwards to `BlendColor`.
    fn blend_color(&self, r: f32, g: f32, b: f32, a: f32) {
        unsafe { BlendColor(r, g, b, a) };
    }

    /// Calls `BlendFunc` using the same factors for colour and alpha.
    fn blend_func(&self, sfactor: GLenum, dfactor: GLenum) {
        unsafe { BlendFunc(sfactor, dfactor, sfactor, dfactor) };
    }

    /// Calls `BlendFunc` with separate colour and alpha factors.
    fn blend_func_separate(
        &self,
        src_rgb: GLenum,
        dest_rgb: GLenum,
        src_alpha: GLenum,
        dest_alpha: GLenum,
    ) {
        unsafe { BlendFunc(src_rgb, dest_rgb, src_alpha, dest_alpha) };
    }

    /// Forwards to `BlendEquation`.
    fn blend_equation(&self, mode: GLenum) {
        unsafe { BlendEquation(mode) };
    }

    /// Not implemented; panics if called.
    fn blend_equation_separate(&self, mode_rgb: GLenum, mode_alpha: GLenum) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn color_mask(&self, r: bool, g: bool, b: bool, a: bool) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn cull_face(&self, mode: GLenum) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn front_face(&self, mode: GLenum) {
        panic!();
    }

    /// Logs the capability and forwards to `Enable`.
    fn enable(&self, cap: GLenum) {
        debug!("enable {}", cap);
        unsafe { Enable(cap) };
    }

    /// Logs the capability and forwards to `Disable`.
    fn disable(&self, cap: GLenum) {
        debug!("disable {}", cap);
        unsafe { Disable(cap) };
    }

    /// Not implemented; panics if called.
    fn hint(&self, param_name: GLenum, param_val: GLenum) {
        panic!();
    }

    fn
is_enabled(&self, cap: GLenum) -> GLboolean { + panic!(); + //0 + } + + fn is_shader(&self, shader: GLuint) -> GLboolean { + panic!(); + //0 + } + + fn is_texture(&self, texture: GLenum) -> GLboolean { + panic!(); + //0 + } + + fn is_framebuffer(&self, framebuffer: GLenum) -> GLboolean { + panic!(); + //0 + } + + fn is_renderbuffer(&self, renderbuffer: GLenum) -> GLboolean { + panic!(); + //0 + } + + fn check_frame_buffer_status(&self, target: GLenum) -> GLenum { + debug!("check_frame_buffer_status {}", target); + //panic!(); + unsafe { CheckFramebufferStatus(target) } + } + + fn enable_vertex_attrib_array(&self, index: GLuint) { + //panic!(); + debug!("enable_vertex_attrib_array {}", index); + unsafe { + EnableVertexAttribArray(index); + //assert_eq!(index, 0); + } + } + + fn disable_vertex_attrib_array(&self, index: GLuint) { + panic!(); + } + + fn uniform_1f(&self, location: GLint, v0: GLfloat) { + panic!(); + } + + fn uniform_1fv(&self, location: GLint, values: &[f32]) { + panic!(); + } + + fn uniform_1i(&self, location: GLint, v0: GLint) { + debug!("uniform_1i {} {}", location, v0); + //panic!(); + unsafe { + Uniform1i(location, v0); + } + } + + fn uniform_1iv(&self, location: GLint, values: &[i32]) { + panic!(); + } + + fn uniform_1ui(&self, location: GLint, v0: GLuint) { + panic!(); + } + + fn uniform_2f(&self, location: GLint, v0: GLfloat, v1: GLfloat) { + panic!(); + } + + fn uniform_2fv(&self, location: GLint, values: &[f32]) { + panic!(); + } + + fn uniform_2i(&self, location: GLint, v0: GLint, v1: GLint) { + panic!(); + } + + fn uniform_2iv(&self, location: GLint, values: &[i32]) { + panic!(); + } + + fn uniform_2ui(&self, location: GLint, v0: GLuint, v1: GLuint) { + panic!(); + } + + fn uniform_3f(&self, location: GLint, v0: GLfloat, v1: GLfloat, v2: GLfloat) { + panic!(); + } + + fn uniform_3fv(&self, location: GLint, values: &[f32]) { + panic!(); + } + + fn uniform_3i(&self, location: GLint, v0: GLint, v1: GLint, v2: GLint) { + panic!(); + } + + fn 
uniform_3iv(&self, location: GLint, values: &[i32]) { + panic!(); + } + + fn uniform_3ui(&self, location: GLint, v0: GLuint, v1: GLuint, v2: GLuint) { + panic!(); + } + + fn uniform_4f(&self, location: GLint, x: GLfloat, y: GLfloat, z: GLfloat, w: GLfloat) { + panic!(); + } + + fn uniform_4i(&self, location: GLint, x: GLint, y: GLint, z: GLint, w: GLint) { + panic!(); + } + + fn uniform_4iv(&self, location: GLint, values: &[i32]) { + panic!(); + } + + fn uniform_4ui(&self, location: GLint, x: GLuint, y: GLuint, z: GLuint, w: GLuint) { + panic!(); + } + + fn uniform_4fv(&self, location: GLint, values: &[f32]) { + unsafe { + Uniform4fv(location, (values.len() / 4) as GLsizei, values.as_ptr()); + } + } + + fn uniform_matrix_2fv(&self, location: GLint, transpose: bool, value: &[f32]) { + panic!(); + } + + fn uniform_matrix_3fv(&self, location: GLint, transpose: bool, value: &[f32]) { + panic!(); + } + + fn uniform_matrix_4fv(&self, location: GLint, transpose: bool, value: &[f32]) { + debug!("uniform_matrix_4fv {} {} {:?}", location, transpose, value); + //panic!(); + unsafe { + UniformMatrix4fv( + location, + (value.len() / 16) as GLsizei, + transpose as GLboolean, + value.as_ptr(), + ); + } + } + + fn depth_mask(&self, flag: bool) { + debug!("depth_mask {}", flag); + //panic!(); + unsafe { + DepthMask(flag as GLboolean); + } + } + + fn depth_range(&self, near: f64, far: f64) { + panic!(); + } + + fn get_active_attrib(&self, program: GLuint, index: GLuint) -> (i32, u32, String) { + panic!(); + //(0, 0, String::new()) + } + + fn get_active_uniform(&self, program: GLuint, index: GLuint) -> (i32, u32, String) { + panic!(); + //(0, 0, String::new()) + } + + fn get_active_uniforms_iv( + &self, + program: GLuint, + indices: Vec, + pname: GLenum, + ) -> Vec { + panic!(); + //Vec::new() + } + + fn get_active_uniform_block_i(&self, program: GLuint, index: GLuint, pname: GLenum) -> GLint { + panic!(); + //0 + } + + fn get_active_uniform_block_iv( + &self, + program: GLuint, + 
index: GLuint, + pname: GLenum, + ) -> Vec { + panic!(); + //Vec::new() + } + + fn get_active_uniform_block_name(&self, program: GLuint, index: GLuint) -> String { + panic!(); + //String::new() + } + + fn get_attrib_location(&self, program: GLuint, name: &str) -> c_int { + let name = CString::new(name).unwrap(); + unsafe { GetAttribLocation(program, name.as_ptr()) } + } + + fn get_frag_data_location(&self, program: GLuint, name: &str) -> c_int { + panic!(); + //0 + } + + fn get_uniform_location(&self, program: GLuint, name: &str) -> c_int { + debug!("get_uniform_location {} {}", program, name); + //panic!(); + let name = CString::new(name).unwrap(); + unsafe { GetUniformLocation(program, name.as_ptr()) } + } + + fn get_program_info_log(&self, program: GLuint) -> String { + debug!("get_program_info_log {}", program); + String::new() + } + + #[inline] + unsafe fn get_program_iv(&self, program: GLuint, pname: GLenum, result: &mut [GLint]) { + debug!("get_program_iv {}", pname); + //panic!(); + assert!(!result.is_empty()); + //#define GL_LINK_STATUS 0x8B82 + if pname == 0x8b82 { + result[0] = GetLinkStatus(program); + } + } + + fn get_program_binary(&self, program: GLuint) -> (Vec, GLenum) { + panic!(); + //(Vec::new(), NONE) + } + + fn program_binary(&self, program: GLuint, format: GLenum, binary: &[u8]) { + panic!(); + } + + fn program_parameter_i(&self, program: GLuint, pname: GLenum, value: GLint) { + panic!(); + } + + #[inline] + unsafe fn get_vertex_attrib_iv(&self, index: GLuint, pname: GLenum, result: &mut [GLint]) { + panic!(); + //assert!(!result.is_empty()); + } + + #[inline] + unsafe fn get_vertex_attrib_fv(&self, index: GLuint, pname: GLenum, result: &mut [GLfloat]) { + panic!(); + //assert!(!result.is_empty()); + } + + fn get_vertex_attrib_pointer_v(&self, index: GLuint, pname: GLenum) -> GLsizeiptr { + panic!(); + //0 + } + + fn get_buffer_parameter_iv(&self, target: GLuint, pname: GLenum) -> GLint { + panic!(); + //0 + } + + fn 
get_shader_info_log(&self, shader: GLuint) -> String {
        // Logs the shader and always returns an empty log.
        debug!("get_shader_info_log {}", shader);
        String::new()
    }

    /// Returns the string reported by `GetString`, or an empty string when the
    /// call yields a null pointer.
    fn get_string(&self, which: GLenum) -> String {
        let llstr = unsafe { GetString(which) };
        if llstr.is_null() {
            return String::new();
        }
        // Lossy conversion instead of `from_utf8_unchecked`: the bytes come
        // from outside Rust, and building a `str` from invalid UTF-8 is
        // undefined behaviour. Valid UTF-8 converts unchanged.
        unsafe { CStr::from_ptr(llstr) }.to_string_lossy().into_owned()
    }

    /// Returns the string reported by `GetStringi`, or an empty string when
    /// the call yields a null pointer.
    fn get_string_i(&self, which: GLenum, index: GLuint) -> String {
        let llstr = unsafe { GetStringi(which, index) };
        if llstr.is_null() {
            return String::new();
        }
        // See `get_string` for why the conversion is lossy rather than unchecked.
        unsafe { CStr::from_ptr(llstr) }.to_string_lossy().into_owned()
    }

    /// Reports success for the compile-status query: for that `pname`,
    /// `result[0]` is set to 1. Any other `pname` leaves `result` untouched.
    /// Panics if `result` is empty.
    unsafe fn get_shader_iv(&self, shader: GLuint, pname: GLenum, result: &mut [GLint]) {
        // gl::COMPILE_STATUS
        const COMPILE_STATUS_PNAME: GLenum = 0x8B81;

        debug!("get_shader_iv");
        assert!(!result.is_empty());
        if pname == COMPILE_STATUS_PNAME {
            result[0] = 1;
        }
    }

    /// Returns fixed `(range_min, range_max, precision)` values per precision
    /// type; no backend is queried.
    fn get_shader_precision_format(
        &self,
        _shader_type: GLuint,
        precision_type: GLuint,
    ) -> (GLint, GLint, GLint) {
        // gl.GetShaderPrecisionFormat is not available until OpenGL 4.1.
        // Fall back to the OpenGL standard precisions that most desktop hardware supports.
        match precision_type {
            LOW_FLOAT | MEDIUM_FLOAT | HIGH_FLOAT => {
                // Fallback to IEEE 754 single precision
                // Range: from -2^127 to 2^127
                // Significand precision: 23 bits
                (127, 127, 23)
            }
            LOW_INT | MEDIUM_INT | HIGH_INT => {
                // Fallback to single precision integer
                // Range: from -2^24 to 2^24
                // Precision: For integer formats this value is always 0
                (24, 24, 0)
            }
            _ => (0, 0, 0),
        }
    }

    /// Only logs the shader; no compilation is triggered here.
    fn compile_shader(&self, shader: GLuint) {
        debug!("compile_shader {}", shader);
    }

    /// Returns the name produced by `CreateProgram`.
    fn create_program(&self) -> GLuint {
        debug!("create_program");
        unsafe { CreateProgram() }
    }

    /// Forwards to `DeleteProgram`.
    fn delete_program(&self, program: GLuint) {
        unsafe { DeleteProgram(program) };
    }

    /// Logs the type and returns the name produced by `CreateShader`.
    fn create_shader(&self, shader_type: GLenum) -> GLuint {
        debug!("create_shader {}", shader_type);
        unsafe { CreateShader(shader_type) }
    }

    /// Logs the shader and forwards to `DeleteShader`.
    fn delete_shader(&self, shader: GLuint) {
        debug!("delete_shader {}", shader);
        unsafe { DeleteShader(shader) };
    }

    /// Only logs the arguments; nothing is detached.
    fn detach_shader(&self, program: GLuint, shader: GLuint) {
        debug!("detach_shader {} {}", program, shader);
    }

    /// Logs the program and forwards to `LinkProgram`.
    fn link_program(&self, program: GLuint) {
        debug!("link_program {}", program);
        unsafe { LinkProgram(program) };
    }

    /// Forwards to `ClearColor`.
    fn clear_color(&self, r: f32, g: f32, b: f32, a: f32) {
        unsafe { ClearColor(r, g, b, a) };
    }

    /// Logs the mask and forwards to `Clear`.
    fn clear(&self, buffer_mask: GLbitfield) {
        debug!("clear {}", buffer_mask);
        unsafe { Clear(buffer_mask) };
    }

    /// Logs the depth and forwards to `ClearDepth`.
    fn clear_depth(&self, depth: f64) {
        debug!("clear_depth {}", depth);
        unsafe { ClearDepth(depth as GLclampd) };
    }

    /// Not implemented; panics if called.
    fn clear_stencil(&self, s: GLint) {
        panic!();
    }

    /// Intentionally does nothing.
    fn flush(&self) {}

    /// Forwards to `Finish`.
    fn finish(&self) {
        unsafe { Finish() };
    }

    /// Returns the result of `GetError`.
    fn get_error(&self) -> GLenum {
        unsafe { GetError() }
    }

    /// Not implemented; panics if called.
    fn stencil_mask(&self, mask: GLuint) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn stencil_mask_separate(&self, face: GLenum, mask: GLuint) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn stencil_func(&self, func: GLenum, ref_: GLint, mask: GLuint) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn stencil_func_separate(&self, face: GLenum, func: GLenum, ref_: GLint, mask: GLuint) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn stencil_op(&self, sfail: GLenum, dpfail: GLenum, dppass: GLenum) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn stencil_op_separate(&self, face: GLenum, sfail: GLenum, dpfail: GLenum, dppass: GLenum) {
        panic!();
    }

    /// Not supported; panics if called.
    fn egl_image_target_texture2d_oes(&self, target: GLenum, image: GLeglImageOES) {
        panic!("not supported")
    }

    /// Not supported; panics if called.
    fn egl_image_target_renderbuffer_storage_oes(&self, target: GLenum, image: GLeglImageOES) {
        panic!("not supported")
    }

    /// Forwards to `GenerateMipmap`.
    fn generate_mipmap(&self, target: GLenum) {
        unsafe { GenerateMipmap(target) };
    }

    /// Not implemented; panics if called.
    fn insert_event_marker_ext(&self, message: &str) {
        panic!();
    }

    /// Not implemented; logs the message and then panics.
    fn push_group_marker_ext(&self, message: &str) {
        debug!("push group {}", message);
        panic!();
    }

    /// Not implemented; logs and then panics.
    fn pop_group_marker_ext(&self) {
        debug!("pop group");
        panic!();
    }

    /// Not implemented; panics if called.
    fn debug_message_insert_khr(
        &self,
        source: GLenum,
        type_: GLenum,
        id: GLuint,
        severity: GLenum,
        message: &str,
    ) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn push_debug_group_khr(&self, source: GLenum, id: GLuint, message: &str) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn pop_debug_group_khr(&self) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn fence_sync(&self, condition: GLenum, flags: GLbitfield) -> GLsync {
        panic!();
    }

    /// Not implemented; panics if called.
    fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) -> GLenum {
        panic!();
    }

    /// Not implemented; panics if called.
    fn wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn texture_range_apple(&self, target: GLenum, data: &[u8]) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn delete_sync(&self, sync: GLsync) {
        panic!();
    }

    /// Not implemented; panics if called.
    fn gen_fences_apple(&self, n: GLsizei) -> Vec<GLuint> {
        panic!();
    }

    /// Not implemented; panics if called.
    fn delete_fences_apple(&self, fences: &[GLuint]) {
        panic!();
    }

    fn set_fence_apple(&self, fence:
GLuint) { + panic!(); + } + + fn finish_fence_apple(&self, fence: GLuint) { + panic!(); + } + + fn test_fence_apple(&self, fence: GLuint) { + panic!(); + } + + fn test_object_apple(&self, object: GLenum, name: GLuint) -> GLboolean { + panic!(); + //0 + } + + fn finish_object_apple(&self, object: GLenum, name: GLuint) { + panic!(); + } + + // GL_ARB_blend_func_extended + fn bind_frag_data_location_indexed( + &self, + program: GLuint, + color_number: GLuint, + index: GLuint, + name: &str, + ) { + panic!(); + } + + fn get_frag_data_index(&self, program: GLuint, name: &str) -> GLint { + panic!(); + //-1 + } + + // GL_KHR_debug + fn get_debug_messages(&self) -> Vec { + Vec::new() + } + + fn provoking_vertex_angle(&self, _mode: GLenum) { + unimplemented!("This extension is GLES only"); + } + + // GL_KHR_blend_equation_advanced + fn blend_barrier_khr(&self) { + // No barrier required, so nothing to do + } + + // GL_CHROMIUM_copy_texture + fn copy_texture_chromium( + &self, + _source_id: GLuint, + _source_level: GLint, + _dest_target: GLenum, + _dest_id: GLuint, + _dest_level: GLint, + _internal_format: GLint, + _dest_type: GLenum, + _unpack_flip_y: GLboolean, + _unpack_premultiply_alpha: GLboolean, + _unpack_unmultiply_alpha: GLboolean, + ) { + unimplemented!("This extension is GLES only"); + } + fn copy_sub_texture_chromium( + &self, + _source_id: GLuint, + _source_level: GLint, + _dest_target: GLenum, + _dest_id: GLuint, + _dest_level: GLint, + _x_offset: GLint, + _y_offset: GLint, + _x: GLint, + _y: GLint, + _width: GLsizei, + _height: GLsizei, + _unpack_flip_y: GLboolean, + _unpack_premultiply_alpha: GLboolean, + _unpack_unmultiply_alpha: GLboolean, + ) { + unimplemented!("This extension is GLES only"); + } + + // GL_ANGLE_copy_texture_3d + fn copy_texture_3d_angle( + &self, + _source_id: GLuint, + _source_level: GLint, + _dest_target: GLenum, + _dest_id: GLuint, + _dest_level: GLint, + _internal_format: GLint, + _dest_type: GLenum, + _unpack_flip_y: GLboolean, + 
_unpack_premultiply_alpha: GLboolean, + _unpack_unmultiply_alpha: GLboolean, + ) { + unimplemented!("Not supported by SWGL"); + } + + fn copy_sub_texture_3d_angle( + &self, + _source_id: GLuint, + _source_level: GLint, + _dest_target: GLenum, + _dest_id: GLuint, + _dest_level: GLint, + _x_offset: GLint, + _y_offset: GLint, + _z_offset: GLint, + _x: GLint, + _y: GLint, + _z: GLint, + _width: GLsizei, + _height: GLsizei, + _depth: GLsizei, + _unpack_flip_y: GLboolean, + _unpack_premultiply_alpha: GLboolean, + _unpack_unmultiply_alpha: GLboolean, + ) { + unimplemented!("Not supported by SWGL"); + } + + fn buffer_storage( + &self, + target: GLenum, + size: GLsizeiptr, + data: *const GLvoid, + flags: GLbitfield, + ) { + unimplemented!("Not supported by SWGL"); + } + + fn flush_mapped_buffer_range(&self, target: GLenum, offset: GLintptr, length: GLsizeiptr) { + unimplemented!("Not supported by SWGL"); + } + + fn start_tiling_qcom( + &self, + x: GLuint, + y: GLuint, + width: GLuint, + height: GLuint, + preserve_mask: GLbitfield, + ) { + unimplemented!("Not supported by SWGL"); + } + + fn end_tiling_qcom(&self, preserve_mask: GLbitfield) { + unimplemented!("Not supported by SWGL"); + } +} + +/// A resource that is intended for sharing between threads. +/// Locked resources such as textures or framebuffers will +/// not allow any further modifications while it remains +/// locked. The resource will be unlocked when LockedResource +/// is dropped. +pub struct LockedResource(*mut LockedTexture); + +unsafe impl Send for LockedResource {} +unsafe impl Sync for LockedResource {} + +#[repr(u8)] +pub enum YuvRangedColorSpace { + Rec601Narrow = 0, + Rec601Full, + Rec709Narrow, + Rec709Full, + Rec2020Narrow, + Rec2020Full, + GbrIdentity, +} + +impl LockedResource { + /// Composites from a locked resource to another locked resource. 
The band + /// offset and height are relative to the destination rectangle and specify + /// how to clip the composition into appropriate range for this band. + pub fn composite( + &self, + locked_src: &LockedResource, + src_x: GLint, + src_y: GLint, + src_width: GLsizei, + src_height: GLsizei, + dst_x: GLint, + dst_y: GLint, + dst_width: GLsizei, + dst_height: GLsizei, + opaque: bool, + flip_x: bool, + flip_y: bool, + filter: GLenum, + clip_x: GLint, + clip_y: GLint, + clip_width: GLsizei, + clip_height: GLsizei, + ) { + unsafe { + Composite( + self.0, + locked_src.0, + src_x, + src_y, + src_width, + src_height, + dst_x, + dst_y, + dst_width, + dst_height, + opaque as GLboolean, + flip_x as GLboolean, + flip_y as GLboolean, + filter, + clip_x, + clip_y, + clip_width, + clip_height, + ); + } + } + + /// Composites from locked resources representing YUV planes + pub fn composite_yuv( + &self, + locked_y: &LockedResource, + locked_u: &LockedResource, + locked_v: &LockedResource, + color_space: YuvRangedColorSpace, + color_depth: GLuint, + src_x: GLint, + src_y: GLint, + src_width: GLsizei, + src_height: GLsizei, + dst_x: GLint, + dst_y: GLint, + dst_width: GLsizei, + dst_height: GLsizei, + flip_x: bool, + flip_y: bool, + clip_x: GLint, + clip_y: GLint, + clip_width: GLsizei, + clip_height: GLsizei, + ) { + unsafe { + CompositeYUV( + self.0, + locked_y.0, + locked_u.0, + locked_v.0, + color_space, + color_depth, + src_x, + src_y, + src_width, + src_height, + dst_x, + dst_y, + dst_width, + dst_height, + flip_x as GLboolean, + flip_y as GLboolean, + clip_x, + clip_y, + clip_width, + clip_height, + ); + } + } + + /// Get the underlying buffer for a locked resource + pub fn get_buffer(&self) -> (*mut c_void, i32, i32, i32) { + unsafe { + let mut width: i32 = 0; + let mut height: i32 = 0; + let mut stride: i32 = 0; + let data_ptr = GetResourceBuffer(self.0, &mut width, &mut height, &mut stride); + (data_ptr, width, height, stride) + } + } +} + +impl Clone for 
LockedResource { + fn clone(&self) -> Self { + unsafe { + LockResource(self.0); + } + LockedResource(self.0) + } +} + +impl Drop for LockedResource { + fn drop(&mut self) { + unsafe { + UnlockResource(self.0); + } + } +} diff --git a/gfx/wr/swgl/src/texture.h b/gfx/wr/swgl/src/texture.h new file mode 100644 index 0000000000..15d0b1fd6a --- /dev/null +++ b/gfx/wr/swgl/src/texture.h @@ -0,0 +1,1310 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +namespace glsl { + +using PackedRGBA8 = V16; +using WideRGBA8 = V16; +using HalfRGBA8 = V8; + +SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); } + +template +UNUSED SI VectorType genericPackWide(VectorType p) { + typedef VectorType packed_type; + // Generic conversions only mask off the low byte without actually clamping + // like a real pack. First force the word to all 1s if it overflows, and then + // add on the sign bit to cause it to roll over to 0 if it was negative. 
+ p = (p | (p > 255)) + (p >> 15); + return CONVERT(p, packed_type); +} + +SI PackedRGBA8 pack(WideRGBA8 p) { +#if USE_SSE2 + return _mm_packus_epi16(lowHalf(p), highHalf(p)); +#elif USE_NEON + return vcombine_u8(vqmovun_s16(bit_cast>(lowHalf(p))), + vqmovun_s16(bit_cast>(highHalf(p)))); +#else + return genericPackWide(p); +#endif +} + +using PackedR8 = V4; +using WideR8 = V4; + +SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); } + +SI PackedR8 pack(WideR8 p) { +#if USE_SSE2 + auto m = expand(p); + auto r = bit_cast>(_mm_packus_epi16(m, m)); + return SHUFFLE(r, r, 0, 1, 2, 3); +#elif USE_NEON + return lowHalf( + bit_cast>(vqmovun_s16(bit_cast>(expand(p))))); +#else + return genericPackWide(p); +#endif +} + +using PackedRG8 = V8; +using WideRG8 = V8; + +SI PackedRG8 pack(WideRG8 p) { +#if USE_SSE2 + return lowHalf(bit_cast>(_mm_packus_epi16(p, p))); +#elif USE_NEON + return bit_cast>(vqmovun_s16(bit_cast>(p))); +#else + return genericPackWide(p); +#endif +} + +SI I32 clampCoord(I32 coord, int limit, int base = 0) { +#if USE_SSE2 + return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)), + _mm_set1_epi32(limit - 1)); +#else + return clamp(coord, base, limit - 1); +#endif +} + +SI int clampCoord(int coord, int limit, int base = 0) { + return min(max(coord, base), limit - 1); +} + +template +SI T clamp2D(T P, S sampler) { + return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)}; +} + +SI float to_float(uint32_t x) { return x * (1.f / 255.f); } + +SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + U32 pixels = {a, b, c, d}; + return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF), + cast(pixels & 0xFF), cast(pixels >> 24)) * + (1.0f / 255.0f); +} + +SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) { + return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y}, + Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w}); +} + +SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) 
{ + return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y}, + I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w}); +} + +SI vec4_scalar pixel_to_vec4(uint32_t p) { + U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24}; + Float f = cast(i) * (1.0f / 255.0f); + return vec4_scalar(f.x, f.y, f.z, f.w); +} + +template +SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) { + return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y], + sampler->buf[offset.z], sampler->buf[offset.w]); +} + +template +vec4 texelFetchRGBA8(S sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsRGBA8(sampler, offset); +} + +template +SI Float fetchOffsetsR8(S sampler, I32 offset) { + U32 i = { + ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y], + ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]}; + return cast(i) * (1.0f / 255.0f); +} + +template +vec4 texelFetchR8(S sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f); +} + +template +SI vec4 fetchOffsetsRG8(S sampler, I32 offset) { + uint16_t* buf = (uint16_t*)sampler->buf; + U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]}; + Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f); + Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f); + return vec4(r, g, 0.0f, 1.0f); +} + +template +vec4 texelFetchRG8(S sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsRG8(sampler, offset); +} + +template +SI Float fetchOffsetsR16(S sampler, I32 offset) { + U32 i = { + ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y], + ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]}; + return cast(i) * (1.0f / 65535.0f); +} + +template +vec4 texelFetchR16(S sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f); 
+} + +template +SI vec4 fetchOffsetsRG16(S sampler, I32 offset) { + U32 pixels = {sampler->buf[offset.x], sampler->buf[offset.y], + sampler->buf[offset.z], sampler->buf[offset.w]}; + Float r = cast(pixels & 0xFFFF) * (1.0f / 65535.0f); + Float g = cast(pixels >> 16) * (1.0f / 65535.0f); + return vec4(r, g, 0.0f, 1.0f); +} + +template +vec4 texelFetchRG16(S sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsRG16(sampler, offset); +} + +SI vec4 fetchOffsetsFloat(const uint32_t* buf, I32 offset) { + return pixel_float_to_vec4(*(Float*)&buf[offset.x], *(Float*)&buf[offset.y], + *(Float*)&buf[offset.z], *(Float*)&buf[offset.w]); +} + +SI vec4 fetchOffsetsFloat(samplerCommon* sampler, I32 offset) { + return fetchOffsetsFloat(sampler->buf, offset); +} + +vec4 texelFetchFloat(sampler2D sampler, ivec2 P) { + I32 offset = P.x * 4 + P.y * sampler->stride; + return fetchOffsetsFloat(sampler, offset); +} + +template +SI vec4 fetchOffsetsYUV422(S sampler, I32 offset) { + // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R. + // Offset is aligned to a chunk rather than a pixel, and selector specifies + // pixel within the chunk. 
+ I32 selector = offset & 1; + offset &= ~1; + uint16_t* buf = (uint16_t*)sampler->buf; + U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y], + *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]}; + Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f); + Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f); + Float g = + CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) * + (1.0f / 255.0f); + return vec4(r, g, b, 1.0f); +} + +template +vec4 texelFetchYUV422(S sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsYUV422(sampler, offset); +} + +vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + switch (sampler->format) { + case TextureFormat::RGBA32F: + return texelFetchFloat(sampler, P); + case TextureFormat::RGBA8: + return texelFetchRGBA8(sampler, P); + case TextureFormat::R8: + return texelFetchR8(sampler, P); + case TextureFormat::RG8: + return texelFetchRG8(sampler, P); + case TextureFormat::R16: + return texelFetchR16(sampler, P); + case TextureFormat::RG16: + return texelFetchRG16(sampler, P); + case TextureFormat::YUV422: + return texelFetchYUV422(sampler, P); + default: + assert(false); + return vec4(); + } +} + +vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return texelFetchFloat(sampler, P); +} + +vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return texelFetchRGBA8(sampler, P); +} + +vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::R8); + return texelFetchR8(sampler, P); +} + +vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + 
assert(sampler->format == TextureFormat::RG8); + return texelFetchRG8(sampler, P); +} + +vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + if (sampler->format == TextureFormat::RGBA32F) { + return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; + } else { + assert(sampler->format == TextureFormat::RGBA8); + return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); + } +} + +vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); +} + +vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::R8); + return vec4_scalar{ + to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f, + 0.0f, 1.0f}; +} + +vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RG8); + uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride]; + return vec4_scalar{to_float(pixel & 0xFF), to_float(pixel >> 8), 0.0f, 1.0f}; +} + +vec4 texelFetch(sampler2DRect sampler, ivec2 P) { + P = clamp2D(P, sampler); + switch (sampler->format) { + case TextureFormat::RGBA8: + return texelFetchRGBA8(sampler, P); + case TextureFormat::R8: + return texelFetchR8(sampler, P); + case TextureFormat::RG8: + return texelFetchRG8(sampler, P); + case TextureFormat::R16: + return texelFetchR16(sampler, P); + case TextureFormat::RG16: + return texelFetchRG16(sampler, 
P); + case TextureFormat::YUV422: + return texelFetchYUV422(sampler, P); + default: + assert(false); + return vec4(); + } +} + +SI ivec4 fetchOffsetsInt(const uint32_t* buf, I32 offset) { + return pixel_int_to_ivec4(*(I32*)&buf[offset.x], *(I32*)&buf[offset.y], + *(I32*)&buf[offset.z], *(I32*)&buf[offset.w]); +} + +SI ivec4 fetchOffsetsInt(samplerCommon* sampler, I32 offset) { + return fetchOffsetsInt(sampler->buf, offset); +} + +ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32I); + I32 offset = P.x * 4 + P.y * sampler->stride; + return fetchOffsetsInt(sampler, offset); +} + +ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32I); + return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +constexpr int MAX_TEXEL_OFFSET = 8; + +// Fill texelFetchOffset outside the valid texture bounds with zeroes. The +// stride will be set to 0 so that only one row of zeroes is needed. 
+static const uint32_t + zeroFetchBuf[MAX_TEXEL_OFFSET * sizeof(Float) / sizeof(uint32_t)] = {0}; + +struct FetchScalar { + const uint32_t* buf; + uint32_t stride; +}; + +template +SI FetchScalar texelFetchPtr(S sampler, ivec2_scalar P, int min_x, int max_x, + int min_y, int max_y) { + assert(max_x < MAX_TEXEL_OFFSET); + if (P.x < -min_x || P.x >= int(sampler->width) - max_x || P.y < -min_y || + P.y >= int(sampler->height) - max_y) { + return FetchScalar{zeroFetchBuf, 0}; + } + return FetchScalar{&sampler->buf[P.x * 4 + P.y * sampler->stride], + sampler->stride}; +} + +SI vec4_scalar texelFetchUnchecked(sampler2D sampler, FetchScalar ptr, int x, + int y = 0) { + assert(sampler->format == TextureFormat::RGBA32F); + return *(vec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride]; +} + +SI ivec4_scalar texelFetchUnchecked(isampler2D sampler, FetchScalar ptr, int x, + int y = 0) { + assert(sampler->format == TextureFormat::RGBA32I); + return *(ivec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride]; +} + +struct FetchVector { + const uint32_t* buf; + I32 offset; + uint32_t stride; +}; + +template +SI FetchVector texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x, + int min_y, int max_y) { + assert(max_x < MAX_TEXEL_OFFSET); + if (test_any(P.x < -min_x || P.x >= int(sampler->width) - max_x || + P.y < -min_y || P.y >= int(sampler->height) - max_y)) { + return FetchVector{zeroFetchBuf, I32(0), 0}; + } + return FetchVector{sampler->buf, P.x * 4 + P.y * sampler->stride, + sampler->stride}; +} + +SI vec4 texelFetchUnchecked(sampler2D sampler, FetchVector ptr, int x, + int y = 0) { + assert(sampler->format == TextureFormat::RGBA32F); + return fetchOffsetsFloat(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset); +} + +SI ivec4 texelFetchUnchecked(isampler2D sampler, FetchVector ptr, int x, + int y = 0) { + assert(sampler->format == TextureFormat::RGBA32I); + return fetchOffsetsInt(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset); +} + +#define texelFetchOffset(sampler, P, lod, offset) \ + 
texelFetch(sampler, (P) + (offset), lod) + +// Scale texture coords for quantization, subtract offset for filtering +// (assuming coords already offset to texel centers), and round to nearest +// 1/scale increment +template +SI T linearQuantize(T P, float scale) { + return P * scale + (0.5f - 0.5f * scale); +} + +// Helper version that also scales normalized texture coords for sampler +template +SI T samplerScale(S sampler, T P) { + P.x *= sampler->width; + P.y *= sampler->height; + return P; +} + +template +SI T samplerScale(UNUSED sampler2DRect sampler, T P) { + return P; +} + +template +SI T linearQuantize(T P, float scale, S sampler) { + return linearQuantize(samplerScale(sampler, P), scale); +} + +// Compute clamped offset of first row for linear interpolation +template +SI auto computeRow(S sampler, I i, size_t margin = 1) -> decltype(i.x) { + return clampCoord(i.x, sampler->width - margin) + + clampCoord(i.y, sampler->height) * sampler->stride; +} + +// Compute clamped offset of second row for linear interpolation from first row +template +SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) { + return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1, + sampler->stride, 0); +} + +// Convert X coordinate to a 2^7 scale fraction for interpolation +template +SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) { + auto overread = i.x > int32_t(sampler->width) - 2; + return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16); +} + +// Convert Y coordinate to a 2^7 scale fraction for interpolation +SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); } +SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); } + +struct WidePlanarRGBA8 { + V8 rg; + V8 ba; +}; + +template +SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::RGBA8); + + ivec2 frac = i; + i >>= 7; + + I32 row0 = computeRow(sampler, i); + I32 row1 = row0 + 
computeNextRowOffset(sampler, i); + I16 fracx = computeFracX(sampler, i, frac); + I16 fracy = computeFracY(frac); + + auto a0 = + CONVERT(unaligned_load>(&sampler->buf[row0.x]), V8); + auto a1 = + CONVERT(unaligned_load>(&sampler->buf[row1.x]), V8); + a0 += ((a1 - a0) * fracy.x) >> 7; + + auto b0 = + CONVERT(unaligned_load>(&sampler->buf[row0.y]), V8); + auto b1 = + CONVERT(unaligned_load>(&sampler->buf[row1.y]), V8); + b0 += ((b1 - b0) * fracy.y) >> 7; + + auto abl = zipLow(a0, b0); + auto abh = zipHigh(a0, b0); + abl += ((abh - abl) * fracx.xyxyxyxy) >> 7; + + auto c0 = + CONVERT(unaligned_load>(&sampler->buf[row0.z]), V8); + auto c1 = + CONVERT(unaligned_load>(&sampler->buf[row1.z]), V8); + c0 += ((c1 - c0) * fracy.z) >> 7; + + auto d0 = + CONVERT(unaligned_load>(&sampler->buf[row0.w]), V8); + auto d1 = + CONVERT(unaligned_load>(&sampler->buf[row1.w]), V8); + d0 += ((d1 - d0) * fracy.w) >> 7; + + auto cdl = zipLow(c0, d0); + auto cdh = zipHigh(c0, d0); + cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7; + + auto rg = V8(zip2Low(abl, cdl)); + auto ba = V8(zip2High(abl, cdl)); + return WidePlanarRGBA8{rg, ba}; +} + +template +vec4 textureLinearRGBA8(S sampler, vec2 P) { + ivec2 i(linearQuantize(P, 128, sampler)); + auto planar = textureLinearPlanarRGBA8(sampler, i); + auto rg = CONVERT(planar.rg, V8); + auto ba = CONVERT(planar.ba, V8); + auto r = lowHalf(rg); + auto g = highHalf(rg); + auto b = lowHalf(ba); + auto a = highHalf(ba); + return vec4(b, g, r, a) * (1.0f / 255.0f); +} + +template +static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::R8); + ivec2 frac = i; + i >>= 7; + + I32 row0 = computeRow(sampler, i); + I32 row1 = row0 + computeNextRowOffset(sampler, i); + I16 fracx = computeFracX(sampler, i, frac); + I16 fracy = computeFracY(frac); + + uint8_t* buf = (uint8_t*)sampler->buf; + auto a0 = unaligned_load>(&buf[row0.x]); + auto b0 = unaligned_load>(&buf[row0.y]); + auto c0 = unaligned_load>(&buf[row0.z]); 
+ auto d0 = unaligned_load>(&buf[row0.w]); + auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8); + + auto a1 = unaligned_load>(&buf[row1.x]); + auto b1 = unaligned_load>(&buf[row1.y]); + auto c1 = unaligned_load>(&buf[row1.z]); + auto d1 = unaligned_load>(&buf[row1.w]); + auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8); + + abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7; + + abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); + auto abcdl = lowHalf(abcd0); + auto abcdh = highHalf(abcd0); + abcdl += ((abcdh - abcdl) * fracx) >> 7; + + return U16(abcdl); +} + +template +vec4 textureLinearR8(S sampler, vec2 P) { + assert(sampler->format == TextureFormat::R8); + + ivec2 i(linearQuantize(P, 128, sampler)); + Float r = CONVERT(textureLinearUnpackedR8(sampler, i), Float); + return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f); +} + +struct WidePlanarRG8 { + V8 rg; +}; + +template +SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::RG8); + + ivec2 frac = i; + i >>= 7; + + I32 row0 = computeRow(sampler, i); + I32 row1 = row0 + computeNextRowOffset(sampler, i); + I16 fracx = computeFracX(sampler, i, frac); + I16 fracy = computeFracY(frac); + + uint16_t* buf = (uint16_t*)sampler->buf; + + // Load RG bytes for two adjacent pixels - rgRG + auto a0 = unaligned_load>(&buf[row0.x]); + auto b0 = unaligned_load>(&buf[row0.y]); + auto ab0 = CONVERT(combine(a0, b0), V8); + // Load two pixels for next row + auto a1 = unaligned_load>(&buf[row1.x]); + auto b1 = unaligned_load>(&buf[row1.y]); + auto ab1 = CONVERT(combine(a1, b1), V8); + // Blend rows + ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7; + + auto c0 = unaligned_load>(&buf[row0.z]); + auto d0 = unaligned_load>(&buf[row0.w]); + auto cd0 = CONVERT(combine(c0, d0), V8); + auto c1 = unaligned_load>(&buf[row1.z]); + auto d1 = unaligned_load>(&buf[row1.w]); + auto cd1 = CONVERT(combine(c1, d1), V8); + // Blend rows + cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7; + + // ab = 
a.rgRG,b.rgRG + // cd = c.rgRG,d.rgRG + // ... ac = ar,cr,ag,cg,aR,cR,aG,cG + // ... bd = br,dr,bg,dg,bR,dR,bG,dG + auto ac = zipLow(ab0, cd0); + auto bd = zipHigh(ab0, cd0); + // ar,br,cr,dr,ag,bg,cg,dg + // aR,bR,cR,dR,aG,bG,cG,dG + auto abcdl = zipLow(ac, bd); + auto abcdh = zipHigh(ac, bd); + // Blend columns + abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7; + + auto rg = V8(abcdl); + return WidePlanarRG8{rg}; +} + +template +vec4 textureLinearRG8(S sampler, vec2 P) { + ivec2 i(linearQuantize(P, 128, sampler)); + auto planar = textureLinearPlanarRG8(sampler, i); + auto rg = CONVERT(planar.rg, V8) * (1.0f / 255.0f); + auto r = lowHalf(rg); + auto g = highHalf(rg); + return vec4(r, g, 0.0f, 1.0f); +} + +// Samples R16 texture with linear filtering and returns results packed as +// signed I16. One bit of precision is shifted away from the bottom end to +// accommodate the sign bit, so only 15 bits of precision is left. +template +static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::R16); + + ivec2 frac = i; + i >>= 7; + + I32 row0 = computeRow(sampler, i); + I32 row1 = row0 + computeNextRowOffset(sampler, i); + + I16 fracx = + CONVERT( + ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F, + I16) + << 8; + I16 fracy = computeFracY(frac) << 8; + + // Sample the 16 bit data for both rows + uint16_t* buf = (uint16_t*)sampler->buf; + auto a0 = unaligned_load>(&buf[row0.x]); + auto b0 = unaligned_load>(&buf[row0.y]); + auto c0 = unaligned_load>(&buf[row0.z]); + auto d0 = unaligned_load>(&buf[row0.w]); + auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8); + + auto a1 = unaligned_load>(&buf[row1.x]); + auto b1 = unaligned_load>(&buf[row1.y]); + auto c1 = unaligned_load>(&buf[row1.z]); + auto d1 = unaligned_load>(&buf[row1.w]); + auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8); + + // The samples occupy 15 bits and the fraction occupies 15 bits, so that when + // they are multiplied 
together, the new scaled sample will fit in the high + // 14 bits of the result. It is left shifted once to make it 15 bits again + // for the final multiply. +#if USE_SSE2 + abcd0 += bit_cast>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww)) + << 1; +#elif USE_NEON + // NEON has a convenient instruction that does both the multiply and the + // doubling, so doesn't need an extra shift. + abcd0 += bit_cast>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww)); +#else + abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8) * + CONVERT(fracy.xxyyzzww, V8)) >> + 16, + V8) + << 1; +#endif + + abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); + auto abcdl = lowHalf(abcd0); + auto abcdh = highHalf(abcd0); +#if USE_SSE2 + abcdl += lowHalf(bit_cast>( + _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx)))) + << 1; +#elif USE_NEON + abcdl += bit_cast>(vqrdmulh_s16(abcdh - abcdl, fracx)); +#else + abcdl += CONVERT((CONVERT(abcdh - abcdl, V4) * + CONVERT(fracx, V4)) >> + 16, + V4) + << 1; +#endif + + return abcdl; +} + +template +vec4 textureLinearR16(S sampler, vec2 P) { + assert(sampler->format == TextureFormat::R16); + + ivec2 i(linearQuantize(P, 128, sampler)); + Float r = CONVERT(textureLinearUnpackedR16(sampler, i), Float); + return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f); +} + +// Samples RG16 texture with linear filtering and returns results packed as +// signed I16. One bit of precision is shifted away from the bottom end to +// accommodate the sign bit, so only 15 bits of precision is left. 
+template +static inline V8 textureLinearUnpackedRG16(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::RG16); + + ivec2 frac = i; + i >>= 7; + + I32 row0 = computeRow(sampler, i); + I32 row1 = row0 + computeNextRowOffset(sampler, i); + + I16 fracx = + CONVERT( + ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F, + I16) + << 8; + I16 fracy = computeFracY(frac) << 8; + + // Sample the 2x16 bit data for both rows + auto a0 = unaligned_load>(&sampler->buf[row0.x]); + auto b0 = unaligned_load>(&sampler->buf[row0.y]); + auto ab0 = CONVERT(combine(a0, b0) >> 1, V8); + auto c0 = unaligned_load>(&sampler->buf[row0.z]); + auto d0 = unaligned_load>(&sampler->buf[row0.w]); + auto cd0 = CONVERT(combine(c0, d0) >> 1, V8); + + auto a1 = unaligned_load>(&sampler->buf[row1.x]); + auto b1 = unaligned_load>(&sampler->buf[row1.y]); + auto ab1 = CONVERT(combine(a1, b1) >> 1, V8); + auto c1 = unaligned_load>(&sampler->buf[row1.z]); + auto d1 = unaligned_load>(&sampler->buf[row1.w]); + auto cd1 = CONVERT(combine(c1, d1) >> 1, V8); + + // The samples occupy 15 bits and the fraction occupies 15 bits, so that when + // they are multiplied together, the new scaled sample will fit in the high + // 14 bits of the result. It is left shifted once to make it 15 bits again + // for the final multiply. +#if USE_SSE2 + ab0 += bit_cast>(_mm_mulhi_epi16(ab1 - ab0, fracy.xxxxyyyy)) << 1; + cd0 += bit_cast>(_mm_mulhi_epi16(cd1 - cd0, fracy.zzzzwwww)) << 1; +#elif USE_NEON + // NEON has a convenient instruction that does both the multiply and the + // doubling, so doesn't need an extra shift. 
+ ab0 += bit_cast>(vqrdmulhq_s16(ab1 - ab0, fracy.xxxxyyyy)); + cd0 += bit_cast>(vqrdmulhq_s16(cd1 - cd0, fracy.zzzzwwww)); +#else + ab0 += CONVERT((CONVERT(ab1 - ab0, V8) * + CONVERT(fracy.xxxxyyyy, V8)) >> + 16, + V8) + << 1; + cd0 += CONVERT((CONVERT(cd1 - cd0, V8) * + CONVERT(fracy.zzzzwwww, V8)) >> + 16, + V8) + << 1; +#endif + + // ab = a.rgRG,b.rgRG + // cd = c.rgRG,d.rgRG + // ... ac = a.rg,c.rg,a.RG,c.RG + // ... bd = b.rg,d.rg,b.RG,d.RG + auto ac = zip2Low(ab0, cd0); + auto bd = zip2High(ab0, cd0); + // a.rg,b.rg,c.rg,d.rg + // a.RG,b.RG,c.RG,d.RG + auto abcdl = zip2Low(ac, bd); + auto abcdh = zip2High(ac, bd); + // Blend columns +#if USE_SSE2 + abcdl += bit_cast>(_mm_mulhi_epi16(abcdh - abcdl, fracx.xxyyzzww)) + << 1; +#elif USE_NEON + abcdl += bit_cast>(vqrdmulhq_s16(abcdh - abcdl, fracx.xxyyzzww)); +#else + abcdl += CONVERT((CONVERT(abcdh - abcdl, V8) * + CONVERT(fracx.xxyyzzww, V8)) >> + 16, + V8) + << 1; +#endif + + return abcdl; +} + +template +vec4 textureLinearRG16(S sampler, vec2 P) { + assert(sampler->format == TextureFormat::RG16); + + ivec2 i(linearQuantize(P, 128, sampler)); + auto rg = bit_cast>(textureLinearUnpackedRG16(sampler, i)); + auto r = cast(rg & 0xFFFF) * (1.0f / 32767.0f); + auto g = cast(rg >> 16) * (1.0f / 32767.0f); + return vec4(r, g, 0.0f, 1.0f); +} + +using PackedRGBA32F = V16; +using WideRGBA32F = V16; + +template +vec4 textureLinearRGBA32F(S sampler, vec2 P) { + assert(sampler->format == TextureFormat::RGBA32F); + P = samplerScale(sampler, P); + P -= 0.5f; + vec2 f = floor(P); + vec2 r = P - f; + ivec2 i(f); + ivec2 c(clampCoord(i.x, sampler->width - 1), + clampCoord(i.y, sampler->height)); + r.x = if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0), + 0.0f); + I32 offset0 = c.x * 4 + c.y * sampler->stride; + I32 offset1 = offset0 + computeNextRowOffset(sampler, i); + + Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x], + *(Float*)&sampler->buf[offset0.x + 4], r.x), + 
mix(*(Float*)&sampler->buf[offset1.x], + *(Float*)&sampler->buf[offset1.x + 4], r.x), + r.y); + Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y], + *(Float*)&sampler->buf[offset0.y + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.y], + *(Float*)&sampler->buf[offset1.y + 4], r.x), + r.y); + Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z], + *(Float*)&sampler->buf[offset0.z + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.z], + *(Float*)&sampler->buf[offset1.z + 4], r.x), + r.y); + Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w], + *(Float*)&sampler->buf[offset0.w + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.w], + *(Float*)&sampler->buf[offset1.w + 4], r.x), + r.y); + return pixel_float_to_vec4(c0, c1, c2, c3); +} + +struct WidePlanarYUV8 { + U16 y; + U16 u; + U16 v; +}; + +template +SI WidePlanarYUV8 textureLinearPlanarYUV422(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::YUV422); + + ivec2 frac = i; + i >>= 7; + + I32 row0 = computeRow(sampler, i, 2); + // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R. + // Get the selector for the pixel within the chunk. + I32 selector = row0 & 1; + // Align the row index to the chunk. + row0 &= ~1; + I32 row1 = row0 + computeNextRowOffset(sampler, i); + // G only needs to be clamped to a pixel boundary for safe interpolation, + // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk + // boundary. + frac.x &= (i.x >= 0); + auto fracx = + CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3), + (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) & + 0x7F, + V8); + I16 fracy = computeFracY(frac); + + uint16_t* buf = (uint16_t*)sampler->buf; + + // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R + // We always need to interpolate between (b,r) and (B,R). + // Depending on selector we need to either interpolate between g0 and g1 + // or between g1 and G0. 
So for now we just interpolate both cases for g + // and will select the appropriate one on output. + auto a0 = CONVERT(unaligned_load>(&buf[row0.x]), V8); + auto a1 = CONVERT(unaligned_load>(&buf[row1.x]), V8); + // Combine with next row. + a0 += ((a1 - a0) * fracy.x) >> 7; + + auto b0 = CONVERT(unaligned_load>(&buf[row0.y]), V8); + auto b1 = CONVERT(unaligned_load>(&buf[row1.y]), V8); + b0 += ((b1 - b0) * fracy.y) >> 7; + + auto c0 = CONVERT(unaligned_load>(&buf[row0.z]), V8); + auto c1 = CONVERT(unaligned_load>(&buf[row1.z]), V8); + c0 += ((c1 - c0) * fracy.z) >> 7; + + auto d0 = CONVERT(unaligned_load>(&buf[row0.w]), V8); + auto d1 = CONVERT(unaligned_load>(&buf[row1.w]), V8); + d0 += ((d1 - d0) * fracy.w) >> 7; + + // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and + // g1,g1,g1,g1,r,r,r,r. + auto abl = zipLow(a0, b0); + auto cdl = zipLow(c0, d0); + auto g0b = zip2Low(abl, cdl); + auto g1r = zip2High(abl, cdl); + + // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and + // and shifts, just shuffle here instead... We finally end up with + // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R. + auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15); + auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15); + auto g1B = zip2Low(abh, cdh); + auto G0R = zip2High(abh, cdh); + + // Finally interpolate between adjacent columns. + g0b += ((g1B - g0b) * fracx) >> 7; + g1r += ((G0R - g1r) * fracx) >> 7; + + // Choose either g0 or g1 based on selector. 
+ return WidePlanarYUV8{ + U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))), + U16(highHalf(g0b)), U16(highHalf(g1r))}; +} + +template +vec4 textureLinearYUV422(S sampler, vec2 P) { + ivec2 i(linearQuantize(P, 128, sampler)); + auto planar = textureLinearPlanarYUV422(sampler, i); + auto y = CONVERT(planar.y, Float) * (1.0f / 255.0f); + auto u = CONVERT(planar.u, Float) * (1.0f / 255.0f); + auto v = CONVERT(planar.v, Float) * (1.0f / 255.0f); + return vec4(v, y, u, 1.0f); +} + +SI vec4 texture(sampler2D sampler, vec2 P) { + if (sampler->filter == TextureFilter::LINEAR) { + switch (sampler->format) { + case TextureFormat::RGBA32F: + return textureLinearRGBA32F(sampler, P); + case TextureFormat::RGBA8: + return textureLinearRGBA8(sampler, P); + case TextureFormat::R8: + return textureLinearR8(sampler, P); + case TextureFormat::RG8: + return textureLinearRG8(sampler, P); + case TextureFormat::R16: + return textureLinearR16(sampler, P); + case TextureFormat::RG16: + return textureLinearRG16(sampler, P); + case TextureFormat::YUV422: + return textureLinearYUV422(sampler, P); + default: + assert(false); + return vec4(); + } + } else { + ivec2 coord(roundzero(P.x, sampler->width), + roundzero(P.y, sampler->height)); + return texelFetch(sampler, coord, 0); + } +} + +vec4 texture(sampler2DRect sampler, vec2 P) { + if (sampler->filter == TextureFilter::LINEAR) { + switch (sampler->format) { + case TextureFormat::RGBA8: + return textureLinearRGBA8(sampler, P); + case TextureFormat::R8: + return textureLinearR8(sampler, P); + case TextureFormat::RG8: + return textureLinearRG8(sampler, P); + case TextureFormat::R16: + return textureLinearR16(sampler, P); + case TextureFormat::RG16: + return textureLinearRG16(sampler, P); + case TextureFormat::YUV422: + return textureLinearYUV422(sampler, P); + default: + assert(false); + return vec4(); + } + } else { + ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f)); + return texelFetch(sampler, coord); + } +} + 
+template +vec4_scalar texture(S sampler, vec2_scalar P) { + return force_scalar(texture(sampler, vec2(P))); +} + +ivec2_scalar textureSize(sampler2D sampler, int) { + return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; +} + +ivec2_scalar textureSize(sampler2DRect sampler) { + return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; +} + +template +static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::RGBA8); + ivec2 frac = i; + i >>= 7; + + I32 row0 = computeRow(sampler, i); + I32 row1 = row0 + computeNextRowOffset(sampler, i); + I16 fracx = computeFracX(sampler, i, frac); + I16 fracy = computeFracY(frac); + + auto a0 = + CONVERT(unaligned_load>(&sampler->buf[row0.x]), V8); + auto a1 = + CONVERT(unaligned_load>(&sampler->buf[row1.x]), V8); + a0 += ((a1 - a0) * fracy.x) >> 7; + + auto b0 = + CONVERT(unaligned_load>(&sampler->buf[row0.y]), V8); + auto b1 = + CONVERT(unaligned_load>(&sampler->buf[row1.y]), V8); + b0 += ((b1 - b0) * fracy.y) >> 7; + + auto abl = combine(lowHalf(a0), lowHalf(b0)); + auto abh = combine(highHalf(a0), highHalf(b0)); + abl += ((abh - abl) * fracx.xxxxyyyy) >> 7; + + auto c0 = + CONVERT(unaligned_load>(&sampler->buf[row0.z]), V8); + auto c1 = + CONVERT(unaligned_load>(&sampler->buf[row1.z]), V8); + c0 += ((c1 - c0) * fracy.z) >> 7; + + auto d0 = + CONVERT(unaligned_load>(&sampler->buf[row0.w]), V8); + auto d1 = + CONVERT(unaligned_load>(&sampler->buf[row1.w]), V8); + d0 += ((d1 - d0) * fracy.w) >> 7; + + auto cdl = combine(lowHalf(c0), lowHalf(d0)); + auto cdh = combine(highHalf(c0), highHalf(d0)); + cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7; + + return combine(HalfRGBA8(abl), HalfRGBA8(cdl)); +} + +template +static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i) { + return pack(textureLinearUnpackedRGBA8(sampler, i)); +} + +template +static PackedRGBA8 textureNearestPackedRGBA8(S sampler, ivec2 i) { + assert(sampler->format == 
TextureFormat::RGBA8); + I32 row = computeRow(sampler, i, 0); + return combine(unaligned_load>(&sampler->buf[row.x]), + unaligned_load>(&sampler->buf[row.y]), + unaligned_load>(&sampler->buf[row.z]), + unaligned_load>(&sampler->buf[row.w])); +} + +template +static PackedR8 textureLinearPackedR8(S sampler, ivec2 i) { + return pack(textureLinearUnpackedR8(sampler, i)); +} + +template +static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i) { + assert(sampler->format == TextureFormat::RG8); + ivec2 frac = i & 0x7F; + i >>= 7; + + I32 row0 = computeRow(sampler, i); + I32 row1 = row0 + computeNextRowOffset(sampler, i); + I16 fracx = computeFracX(sampler, i, frac); + I16 fracy = computeFracY(frac); + + uint16_t* buf = (uint16_t*)sampler->buf; + + // Load RG bytes for two adjacent pixels - rgRG + auto a0 = unaligned_load>(&buf[row0.x]); + auto b0 = unaligned_load>(&buf[row0.y]); + auto ab0 = CONVERT(combine(a0, b0), V8); + // Load two pixels for next row + auto a1 = unaligned_load>(&buf[row1.x]); + auto b1 = unaligned_load>(&buf[row1.y]); + auto ab1 = CONVERT(combine(a1, b1), V8); + // Blend rows + ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7; + + auto c0 = unaligned_load>(&buf[row0.z]); + auto d0 = unaligned_load>(&buf[row0.w]); + auto cd0 = CONVERT(combine(c0, d0), V8); + auto c1 = unaligned_load>(&buf[row1.z]); + auto d1 = unaligned_load>(&buf[row1.w]); + auto cd1 = CONVERT(combine(c1, d1), V8); + // Blend rows + cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7; + + // ab = a.rgRG,b.rgRG + // cd = c.rgRG,d.rgRG + // ... ac = a.rg,c.rg,a.RG,c.RG + // ... 
bd = b.rg,d.rg,b.RG,d.RG + auto ac = zip2Low(ab0, cd0); + auto bd = zip2High(ab0, cd0); + // a.rg,b.rg,c.rg,d.rg + // a.RG,b.RG,c.RG,d.RG + auto abcdl = zip2Low(ac, bd); + auto abcdh = zip2High(ac, bd); + // Blend columns + abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7; + + return WideRG8(abcdl); +} + +template +static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i) { + return pack(textureLinearUnpackedRG8(sampler, i)); +} + +template +static ALWAYS_INLINE VectorType addsat(VectorType x, + VectorType y) { + auto r = x + y; + return r | (r < x); +} + +static inline V8 addsat(V8 x, V8 y) { +#if USE_SSE2 + return _mm_adds_epu16(x, y); +#elif USE_NEON + return vqaddq_u16(x, y); +#else + auto r = x + y; + return r | (r < x); +#endif +} + +template +static VectorType gaussianBlurHorizontal( + S sampler, const ivec2_scalar& i, int minX, int maxX, int radius, + float coeff, float coeffStep) { + // Packed and unpacked vectors for a chunk of the given pixel type. + typedef VectorType packed_type; + typedef VectorType unpacked_type; + + // Pre-scale the coefficient by 8 bits of fractional precision, so that when + // the sample is multiplied by it, it will yield a 16 bit unsigned integer + // that will use all 16 bits of precision to accumulate the sum. + coeff *= 1 << 8; + float coeffStep2 = coeffStep * coeffStep; + + int row = computeRow(sampler, i); + P* buf = (P*)sampler->buf; + auto pixelsRight = unaligned_load>(&buf[row]); + auto pixelsLeft = pixelsRight; + auto sum = CONVERT(bit_cast(pixelsRight), unpacked_type) * + uint16_t(coeff + 0.5f); + + // Here we use some trickery to reuse the pixels within a chunk, shifted over + // by one pixel, to get the next sample for the entire chunk. This allows us + // to sample only one pixel for each offset across the entire chunk in both + // the left and right directions. 
To avoid clamping within the loop to the + // texture bounds, we compute the valid radius that doesn't require clamping + // and fall back to a slower clamping loop outside of that valid radius. + int offset = 1; + // The left bound is how much we can offset the sample before the start of + // the row bounds. + int leftBound = i.x - max(minX, 0); + // The right bound is how much we can offset the sample before the end of the + // row bounds. + int rightBound = min(maxX, sampler->width - 1) - i.x; + int validRadius = min(radius, min(leftBound, rightBound - (4 - 1))); + for (; offset <= validRadius; offset++) { + // Overwrite the pixel that needs to be shifted out with the new pixel, and + // shift it into the correct location. + pixelsRight.x = unaligned_load

(&buf[row + offset + 4 - 1]); + pixelsRight = pixelsRight.yzwx; + pixelsLeft = pixelsLeft.wxyz; + pixelsLeft.x = unaligned_load

(&buf[row - offset]); + + // Accumulate the Gaussian coefficients step-wise. + coeff *= coeffStep; + coeffStep *= coeffStep2; + + // Both left and right samples at this offset use the same coefficient. + sum = addsat(sum, + (CONVERT(bit_cast(pixelsRight), unpacked_type) + + CONVERT(bit_cast(pixelsLeft), unpacked_type)) * + uint16_t(coeff + 0.5f)); + } + + for (; offset <= radius; offset++) { + pixelsRight.x = + unaligned_load

(&buf[row + min(offset + 4 - 1, rightBound)]); + pixelsRight = pixelsRight.yzwx; + pixelsLeft = pixelsLeft.wxyz; + pixelsLeft.x = unaligned_load

(&buf[row - min(offset, leftBound)]); + + coeff *= coeffStep; + coeffStep *= coeffStep2; + + sum = addsat(sum, + (CONVERT(bit_cast(pixelsRight), unpacked_type) + + CONVERT(bit_cast(pixelsLeft), unpacked_type)) * + uint16_t(coeff + 0.5f)); + } + + // Shift away the intermediate precision. + return sum >> 8; +} + +template +static VectorType gaussianBlurVertical( + S sampler, const ivec2_scalar& i, int minY, int maxY, int radius, + float coeff, float coeffStep) { + // Packed and unpacked vectors for a chunk of the given pixel type. + typedef VectorType packed_type; + typedef VectorType unpacked_type; + + // Pre-scale the coefficient by 8 bits of fractional precision, so that when + // the sample is multiplied by it, it will yield a 16 bit unsigned integer + // that will use all 16 bits of precision to accumulate the sum. + coeff *= 1 << 8; + float coeffStep2 = coeffStep * coeffStep; + + int rowAbove = computeRow(sampler, i); + int rowBelow = rowAbove; + P* buf = (P*)sampler->buf; + auto pixels = unaligned_load>(&buf[rowAbove]); + auto sum = CONVERT(bit_cast(pixels), unpacked_type) * + uint16_t(coeff + 0.5f); + + // For the vertical loop we can't be quite as creative with reusing old values + // as we were in the horizontal loop. We just do the obvious implementation of + // loading a chunk from each row in turn and accumulating it into the sum. We + // compute a valid radius within which we don't need to clamp the sampled row + // and use that to avoid any clamping in the main inner loop. We fall back to + // a slower clamping loop outside of that valid radius. 
+ int offset = 1; + int belowBound = i.y - max(minY, 0); + int aboveBound = min(maxY, sampler->height - 1) - i.y; + int validRadius = min(radius, min(belowBound, aboveBound)); + for (; offset <= validRadius; offset++) { + rowAbove += sampler->stride; + rowBelow -= sampler->stride; + auto pixelsAbove = unaligned_load>(&buf[rowAbove]); + auto pixelsBelow = unaligned_load>(&buf[rowBelow]); + + // Accumulate the Gaussian coefficients step-wise. + coeff *= coeffStep; + coeffStep *= coeffStep2; + + // Both above and below samples at this offset use the same coefficient. + sum = addsat(sum, + (CONVERT(bit_cast(pixelsAbove), unpacked_type) + + CONVERT(bit_cast(pixelsBelow), unpacked_type)) * + uint16_t(coeff + 0.5f)); + } + + for (; offset <= radius; offset++) { + if (offset <= aboveBound) { + rowAbove += sampler->stride; + } + if (offset <= belowBound) { + rowBelow -= sampler->stride; + } + auto pixelsAbove = unaligned_load>(&buf[rowAbove]); + auto pixelsBelow = unaligned_load>(&buf[rowBelow]); + + coeff *= coeffStep; + coeffStep *= coeffStep2; + + sum = addsat(sum, + (CONVERT(bit_cast(pixelsAbove), unpacked_type) + + CONVERT(bit_cast(pixelsBelow), unpacked_type)) * + uint16_t(coeff + 0.5f)); + } + + // Shift away the intermediate precision. + return sum >> 8; +} + +} // namespace glsl diff --git a/gfx/wr/swgl/src/vector_type.h b/gfx/wr/swgl/src/vector_type.h new file mode 100644 index 0000000000..43364ffcce --- /dev/null +++ b/gfx/wr/swgl/src/vector_type.h @@ -0,0 +1,563 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifdef __clang__ +# ifdef __SSE2__ +# include +# define USE_SSE2 1 +# endif +# ifdef __ARM_NEON +# include +# define USE_NEON 1 +# endif +#endif + +namespace glsl { + +#ifdef __clang__ +template +using VectorType = T __attribute__((ext_vector_type(N))); + +# define CONVERT(vector, type) __builtin_convertvector(vector, type) +# define SHUFFLE(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) + +template +SI VectorType combine(VectorType a, VectorType b) { + return __builtin_shufflevector(a, b, 0, 1, 2, 3); +} + +template +SI VectorType combine(VectorType a, VectorType b) { + return __builtin_shufflevector(a, b, 0, 1, 2, 3, 4, 5, 6, 7); +} + +template +SI VectorType combine(VectorType a, VectorType b) { + return __builtin_shufflevector(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15); +} + +template +SI VectorType lowHalf(VectorType a) { + return __builtin_shufflevector(a, a, 0, 1); +} + +template +SI VectorType highHalf(VectorType a) { + return __builtin_shufflevector(a, a, 2, 3); +} + +template +SI VectorType lowHalf(VectorType a) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3); +} + +template +SI VectorType highHalf(VectorType a) { + return __builtin_shufflevector(a, a, 4, 5, 6, 7); +} + +template +SI VectorType lowHalf(VectorType a) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +template +SI VectorType highHalf(VectorType a) { + return __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15); +} + +template +SI VectorType expand(VectorType a) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3, -1, -1, -1, -1); +} +#else +template +struct VectorMask { + typedef T type; +}; +template <> +struct VectorMask { + typedef int32_t type; +}; +template <> +struct VectorMask { + typedef int16_t type; +}; +template <> +struct VectorMask { + typedef int8_t type; +}; +template <> +struct VectorMask { + typedef int type; +}; + +template +struct VectorType { + enum { SIZE = N }; + + typedef T data_type 
__attribute__((vector_size(sizeof(T) * N))); + typedef typename VectorMask::type mask_index; + typedef mask_index mask_type + __attribute__((vector_size(sizeof(mask_index) * N))); + typedef T half_type __attribute__((vector_size(sizeof(T) * (N / 2)))); + union { + data_type data; + struct { + T x, y, z, w; + }; + T elements[N]; + struct { + half_type low_half, high_half; + }; + }; + + VectorType() : data{0} {} + + constexpr VectorType(const VectorType& rhs) : data(rhs.data) {} + // GCC vector extensions only support broadcasting scalars on arithmetic ops, + // but not on initializers, hence the following... + constexpr VectorType(T n) : data((data_type){0} + n) {} + constexpr VectorType(T a, T b, T c, T d) : data{a, b, c, d} {} + constexpr VectorType(T a, T b, T c, T d, T e, T f, T g, T h) + : data{a, b, c, d, e, f, g, h} {} + constexpr VectorType(T a, T b, T c, T d, T e, T f, T g, T h, T i, T j, T k, + T l, T m, T n, T o, T p) + : data{a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p} {} + + SI VectorType wrap(const data_type& data) { + VectorType v; + v.data = data; + return v; + } + + T& operator[](size_t i) { return elements[i]; } + T operator[](size_t i) const { return elements[i]; } + + template + operator VectorType() const { + return VectorType::wrap( + (typename VectorType::data_type){U(x), U(y)}); + } + template + operator VectorType() const { + return VectorType::wrap( + (typename VectorType::data_type){U(x), U(y), U(z), U(w)}); + } + template + operator VectorType() const { + return VectorType::wrap((typename VectorType::data_type){ + U(elements[0]), U(elements[1]), U(elements[2]), U(elements[3]), + U(elements[4]), U(elements[5]), U(elements[6]), U(elements[7])}); + } + template + operator VectorType() const { + return VectorType::wrap((typename VectorType::data_type){ + U(elements[0]), + U(elements[1]), + U(elements[2]), + U(elements[3]), + U(elements[4]), + U(elements[5]), + U(elements[6]), + U(elements[7]), + U(elements[8]), + U(elements[9]), + 
U(elements[10]), + U(elements[11]), + U(elements[12]), + U(elements[13]), + U(elements[14]), + U(elements[15]), + }); + } + + VectorType operator-() const { return wrap(-data); } + VectorType operator~() const { return wrap(~data); } + + VectorType operator&(VectorType x) const { return wrap(data & x.data); } + VectorType operator&(T x) const { return wrap(data & x); } + VectorType operator|(VectorType x) const { return wrap(data | x.data); } + VectorType operator|(T x) const { return wrap(data | x); } + VectorType operator^(VectorType x) const { return wrap(data ^ x.data); } + VectorType operator^(T x) const { return wrap(data ^ x); } + VectorType operator<<(int x) const { return wrap(data << x); } + VectorType operator>>(int x) const { return wrap(data >> x); } + VectorType operator+(VectorType x) const { return wrap(data + x.data); } + VectorType operator+(T x) const { return wrap(data + x); } + friend VectorType operator+(T x, VectorType y) { return wrap(x + y.data); } + VectorType operator-(VectorType x) const { return wrap(data - x.data); } + VectorType operator-(T x) const { return wrap(data - x); } + friend VectorType operator-(T x, VectorType y) { return wrap(x - y.data); } + VectorType operator*(VectorType x) const { return wrap(data * x.data); } + VectorType operator*(T x) const { return wrap(data * x); } + friend VectorType operator*(T x, VectorType y) { return wrap(x * y.data); } + VectorType operator/(VectorType x) const { return wrap(data / x.data); } + VectorType operator/(T x) const { return wrap(data / x); } + friend VectorType operator/(T x, VectorType y) { return wrap(x / y.data); } + VectorType operator%(int x) const { return wrap(data % x); } + + VectorType& operator&=(VectorType x) { + data &= x.data; + return *this; + } + VectorType& operator|=(VectorType x) { + data |= x.data; + return *this; + } + VectorType& operator^=(VectorType x) { + data ^= x.data; + return *this; + } + VectorType& operator<<=(int x) { + data <<= x; + return *this; + 
} + VectorType& operator>>=(int x) { + data >>= x; + return *this; + } + VectorType& operator+=(VectorType x) { + data += x.data; + return *this; + } + VectorType& operator-=(VectorType x) { + data -= x.data; + return *this; + } + VectorType& operator*=(VectorType x) { + data *= x.data; + return *this; + } + VectorType& operator/=(VectorType x) { + data /= x.data; + return *this; + } + VectorType& operator%=(int x) { + data %= x; + return *this; + } + + VectorType operator==(VectorType x) const { + return VectorType::wrap(data == x.data); + } + VectorType operator!=(VectorType x) const { + return VectorType::wrap(data != x.data); + } + VectorType operator<(VectorType x) const { + return VectorType::wrap(data < x.data); + } + VectorType operator>(VectorType x) const { + return VectorType::wrap(data > x.data); + } + VectorType operator<=(VectorType x) const { + return VectorType::wrap(data <= x.data); + } + VectorType operator>=(VectorType x) const { + return VectorType::wrap(data >= x.data); + } + + VectorType operator!() const { return wrap(!data); } + VectorType operator&&(VectorType x) const { return wrap(data & x.data); } + VectorType operator||(VectorType x) const { return wrap(data | x.data); } + + VectorType& operator=(VectorType x) { + data = x.data; + return *this; + } + + VectorType shuffle(VectorType b, mask_index x, mask_index y, + mask_index z, mask_index w) const { + return VectorType::wrap(__builtin_shuffle( + data, b.data, (typename VectorType::mask_type){x, y, z, w})); + } + VectorType shuffle(VectorType b, mask_index x, mask_index y, + mask_index z, mask_index w, mask_index s, + mask_index t, mask_index u, mask_index v) const { + return VectorType::wrap(__builtin_shuffle( + data, b.data, + (typename VectorType::mask_type){x, y, z, w, s, t, u, v})); + } + VectorType shuffle(VectorType b, mask_index x, mask_index y, + mask_index z, mask_index w, mask_index s, + mask_index t, mask_index u, mask_index v, + mask_index i, mask_index j, mask_index k, + 
mask_index l, mask_index m, mask_index n, + mask_index o, mask_index p) const { + return VectorType::wrap( + __builtin_shuffle(data, b.data, + (typename VectorType::mask_type){ + x, y, z, w, s, t, u, v, i, j, k, l, m, n, o, p})); + } + + VectorType swizzle(mask_index x, mask_index y, mask_index z, + mask_index w) const { + return VectorType::wrap(__builtin_shuffle( + data, (typename VectorType::mask_type){x, y, z, w})); + } + VectorType swizzle(mask_index x, mask_index y, mask_index z, + mask_index w, mask_index s, mask_index t, + mask_index u, mask_index v) const { + return VectorType::wrap(__builtin_shuffle( + data, (typename VectorType::mask_type){x, y, z, w, s, t, u, v})); + } + + SI VectorType wrap(half_type low, half_type high) { + VectorType v; + v.low_half = low; + v.high_half = high; + return v; + } + + VectorType combine(VectorType high) const { + return VectorType::wrap(data, high.data); + } + +# define xxxx swizzle(0, 0, 0, 0) +# define yyyy swizzle(1, 1, 1, 1) +# define zzzz swizzle(2, 2, 2, 2) +# define wwww swizzle(3, 3, 3, 3) +# define xxyy swizzle(0, 0, 1, 1) +# define xxzz swizzle(0, 0, 2, 2) +# define yyww swizzle(1, 1, 3, 3) +# define zzww swizzle(2, 2, 3, 3) +# define xyxy swizzle(0, 1, 0, 1) +# define xzxz swizzle(0, 2, 0, 2) +# define ywyw swizzle(1, 3, 1, 3) +# define zwzw swizzle(2, 3, 2, 3) +# define zwxy swizzle(2, 3, 0, 1) +# define zyxw swizzle(2, 1, 0, 3) +# define xxyz swizzle(0, 0, 1, 2) +# define xyyz swizzle(0, 1, 1, 2) +# define xyzz swizzle(0, 1, 2, 2) +# define xzyw swizzle(0, 2, 1, 3) +# define yzwx swizzle(1, 2, 3, 0) +# define wxyz swizzle(3, 0, 1, 2) +# define wzyx swizzle(3, 2, 1, 0) +# define xxxxyyyy XXXXYYYY() + VectorType XXXXYYYY() const { + return swizzle(0, 0, 0, 0).combine(swizzle(1, 1, 1, 1)); + } +# define zzzzwwww ZZZZWWWW() + VectorType ZZZZWWWW() const { + return swizzle(2, 2, 2, 2).combine(swizzle(3, 3, 3, 3)); + } +# define xyzwxyzw XYZWXYZW() + VectorType XYZWXYZW() const { return combine(*this); } +# define 
xyxyxyxy XYXYXYXY() + VectorType XYXYXYXY() const { + return swizzle(0, 1, 0, 1).combine(swizzle(0, 1, 0, 1)); + } +# define zwzwzwzw ZWZWZWZW() + VectorType ZWZWZWZW() const { + return swizzle(2, 3, 2, 3).combine(swizzle(2, 3, 2, 3)); + } +# define xxyyzzww XXYYZZWW() + VectorType XXYYZZWW() const { + return swizzle(0, 0, 1, 1).combine(swizzle(2, 2, 3, 3)); + } +# define xxxxyyyyzzzzwwww XXXXYYYYZZZZWWWW() + VectorType XXXXYYYYZZZZWWWW() { + return XXXXYYYY().combine(ZZZZWWWW()); + } +}; + +template +struct VectorType { + typedef T data_type __attribute__((vector_size(sizeof(T) * 2))); + union { + data_type data; + struct { + T x, y; + }; + T elements[2]; + }; + + SI VectorType wrap(const data_type& data) { + VectorType v; + v.data = data; + return v; + } + + VectorType operator&(VectorType x) const { return wrap(data & x.data); } + VectorType operator&(T x) const { return wrap(data & x); } + VectorType operator|(VectorType x) const { return wrap(data | x.data); } + VectorType operator|(T x) const { return wrap(data | x); } +}; + +# define CONVERT(vector, type) ((type)(vector)) +# define SHUFFLE(a, b, ...) 
a.shuffle(b, __VA_ARGS__) + +template +SI VectorType combine(VectorType a, VectorType b) { + return VectorType::wrap(a.data, b.data); +} + +template +SI VectorType lowHalf(VectorType a) { + return VectorType::wrap(a.low_half); +} + +template +SI VectorType highHalf(VectorType a) { + return VectorType::wrap(a.high_half); +} + +template +SI VectorType expand(VectorType a) { + return combine(a, a); +} +#endif + +template +SI VectorType combine(VectorType a, VectorType b, + VectorType c, VectorType d) { + return combine(combine(a, b), combine(c, d)); +} + +template +SI VectorType combineLow(VectorType a, VectorType b) { + return combine(lowHalf(a), lowHalf(b)); +} + +template +SI VectorType combineHigh(VectorType a, VectorType b) { + return combine(highHalf(a), highHalf(b)); +} + +template +SI VectorType repeat2(VectorType a) { + return combine(a, a); +} + +template +SI VectorType repeat4(VectorType a) { + return combine(a, a, a, a); +} + +template +SI VectorType zipLow(VectorType a, VectorType b) { + return SHUFFLE(a, b, 0, 4, 1, 5); +} + +template +SI VectorType zipHigh(VectorType a, VectorType b) { + return SHUFFLE(a, b, 2, 6, 3, 7); +} + +template +SI VectorType zipLow(VectorType a, VectorType b) { + return SHUFFLE(a, b, 0, 8, 1, 9, 2, 10, 3, 11); +} + +template +SI VectorType zipHigh(VectorType a, VectorType b) { + return SHUFFLE(a, b, 4, 12, 5, 13, 6, 14, 7, 15); +} + +template +SI VectorType zipLow(VectorType a, VectorType b) { + return SHUFFLE(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); +} + +template +SI VectorType zipHigh(VectorType a, VectorType b) { + return SHUFFLE(a, b, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, + 31); +} + +template +SI VectorType zip2Low(VectorType a, VectorType b) { + return SHUFFLE(a, b, 0, 1, 8, 9, 2, 3, 10, 11); +} + +template +SI VectorType zip2High(VectorType a, VectorType b) { + return SHUFFLE(a, b, 4, 5, 12, 13, 6, 7, 14, 15); +} + +#ifdef __clang__ +template +SI VectorType zip(VectorType a, 
VectorType b) { + return SHUFFLE(a, b, 0, 4, 1, 5, 2, 6, 3, 7); +} + +template +SI VectorType zip(VectorType a, VectorType b) { + return SHUFFLE(a, b, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); +} +#else +template +SI VectorType zip(VectorType a, VectorType b) { + return combine(zipLow(a, b), zipHigh(a, b)); +} +#endif + +template +struct Unaligned { + template + SI T load(const P* p) { + T v; + memcpy(&v, p, sizeof(v)); + return v; + } + + template + SI void store(P* p, T v) { + memcpy(p, &v, sizeof(v)); + } +}; + +#ifndef __clang__ +template +struct Unaligned> { + template + SI VectorType load(const P* p) { + VectorType v; + memcpy(v.elements, p, sizeof(v)); + return v; + } + + template + SI void store(P* p, VectorType v) { + memcpy(p, v.elements, sizeof(v)); + } +}; +#endif + +template +SI T unaligned_load(const P* p) { + return Unaligned::load(p); +} + +template +SI void unaligned_store(P* p, T v) { + Unaligned::store(p, v); +} + +template +SI D bit_cast(const S& src) { + static_assert(sizeof(D) == sizeof(S), ""); + return unaligned_load(&src); +} + +template +using V2 = VectorType; +template +using V4 = VectorType; +using Float = V4; +using I32 = V4; +using I16 = V4; +using U64 = V4; +using U32 = V4; +using U16 = V4; +using U8 = V4; +using Bool = V4; +template +using V8 = VectorType; +template +using V16 = VectorType; + +} // namespace glsl -- cgit v1.2.3