1 files changed, 3119 insertions, 0 deletions
diff --git a/gfx/wr/swgl/src/glsl.h b/gfx/wr/swgl/src/glsl.h
new file mode 100644
index 0000000000..9193c72424
--- /dev/null
+++ b/gfx/wr/swgl/src/glsl.h
@@ -0,0 +1,3119 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#define SI ALWAYS_INLINE static
+
+#include "vector_type.h"
+
+namespace glsl {
+
+enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8, RG8, R16, RG16, YUV422 };
+
+enum TextureFilter { NEAREST, LINEAR };
+
+struct samplerCommon {
+  uint32_t* buf = nullptr;
+  uint32_t stride = 0;  // in units of BPP if < 4, or dwords if BPP >= 4
+  uint32_t height = 0;
+  uint32_t width = 0;
+  TextureFormat format = TextureFormat::RGBA8;
+};
+
+struct samplerFilter {
+  TextureFilter filter = TextureFilter::NEAREST;
+};
+
+struct sampler2D_impl : samplerCommon, samplerFilter {};
+typedef sampler2D_impl* sampler2D;
+
+typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8;
+typedef struct sampler2DRG8_impl : sampler2D_impl{} * sampler2DRG8;
+typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8;
+typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F;
+
+struct isampler2D_impl : samplerCommon {};
+typedef isampler2D_impl* isampler2D;
+
+struct isampler2DRGBA32I_impl : isampler2D_impl {};
+typedef isampler2DRGBA32I_impl* isampler2DRGBA32I;
+
+struct sampler2DRect_impl : samplerCommon, samplerFilter {};
+typedef sampler2DRect_impl* sampler2DRect;
+
+#if USE_SSE2
+SI bool test_all(Bool cond) { return _mm_movemask_ps(cond) == 0xF; }
+SI bool test_any(Bool cond) { return _mm_movemask_ps(cond) != 0; }
+SI bool test_none(Bool cond) { return _mm_movemask_ps(cond) == 0; }
+#else
+SI bool test_all(Bool cond) {
+  return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0xFFFFFFFFU;
+}
+SI bool test_any(Bool cond) {
+  return bit_cast<uint32_t>(CONVERT(cond, U8)) != 0;
+}
+SI bool test_none(Bool cond) {
+  return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0;
+}
+#endif
+SI bool test_equal(Bool cond) { return test_none(cond != cond.x); }
+
+float make_float(float n) { return n; }
+
+float make_float(int32_t n) { return float(n); }
+
+float make_float(uint32_t n) { return float(n); }
+
+float make_float(bool n) { return float(n); }
+
+template <typename T>
+Float make_float(T v) {
+  return CONVERT(v, Float);
+}
+
+int32_t make_int(uint32_t n) { return n; }
+
+int32_t make_int(int32_t n) { return n; }
+
+int32_t make_int(float n) { return int32_t(n); }
+
+int32_t make_int(bool n) { return int32_t(n); }
+
+template <typename T>
+I32 make_int(T v) {
+  return CONVERT(v, I32);
+}
+
+uint32_t make_uint(uint32_t n) { return n; }
+
+uint32_t make_uint(int32_t n) { return n; }
+
+uint32_t make_uint(float n) { return uint32_t(n); }
+
+uint32_t make_uint(bool n) { return uint32_t(n); }
+
+template <typename T>
+U32 make_uint(T v) {
+  return CONVERT(v, U32);
+}
+
+template <typename T>
+T force_scalar(T n) {
+  return n;
+}
+
+float force_scalar(Float f) { return f[0]; }
+
+int32_t force_scalar(I32 i) { return i[0]; }
+
+struct vec4;
+struct ivec2;
+
+SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) { return c ? t : e; }
+SI int32_t if_then_else(bool c, int32_t t, int32_t e) { return c ? t : e; }
+
+SI float if_then_else(int32_t c, float t, float e) { return c ? t : e; }
+
+SI Float if_then_else(I32 c, float t, float e) {
+  return bit_cast<Float>((c & bit_cast<I32>(Float(t))) |
+                         (~c & bit_cast<I32>(Float(e))));
+}
+
+SI I32 if_then_else(I32 c, int32_t t, int32_t e) {
+  return (c & I32(t)) | (~c & I32(e));
+}
+
+SI U32 if_then_else(I32 c, U32 t, U32 e) {
+  return bit_cast<U32>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e)));
+}
+
+// Cheaper version of if_then_else that returns Float(0) if condition is false.
+SI Float if_then(I32 c, Float t) {
+  return bit_cast<Float>(c & bit_cast<I32>(t));
+}
+
+SI Float if_then_else(I32 c, Float t, Float e) {
+  return bit_cast<Float>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e)));
+}
+
+SI Float if_then_else(int32_t c, Float t, Float e) { return c ? t : e; }
+
+SI Bool if_then_else(I32 c, Bool t, Bool e) { return (c & t) | (~c & e); }
+
+SI Bool if_then_else(int32_t c, Bool t, Bool e) { return c ? t : e; }
+
+SI I16 if_then_else(I16 c, I16 t, I16 e) { return (c & t) | (~c & e); }
+
+template <typename T>
+SI void swap(T& a, T& b) {
+  T t(a);
+  a = b;
+  b = t;
+}
+
+SI int32_t min(int32_t a, int32_t b) { return a < b ? a : b; }
+SI int32_t max(int32_t a, int32_t b) { return a > b ? a : b; }
+
+SI int32_t clamp(int32_t a, int32_t minVal, int32_t maxVal) {
+  return min(max(a, minVal), maxVal);
+}
+
+SI float min(float a, float b) { return a < b ? a : b; }
+SI float max(float a, float b) { return a > b ? a : b; }
+
+SI float clamp(float a, float minVal, float maxVal) {
+  return min(max(a, minVal), maxVal);
+}
+
+SI Float min(Float a, Float b) {
+#if USE_SSE2
+  return _mm_min_ps(a, b);
+#elif USE_NEON
+  return vminq_f32(a, b);
+#else
+  return if_then_else(a < b, a, b);
+#endif
+}
+
+SI Float max(Float a, Float b) {
+#if USE_SSE2
+  return _mm_max_ps(a, b);
+#elif USE_NEON
+  return vmaxq_f32(a, b);
+#else
+  return if_then_else(a > b, a, b);
+#endif
+}
+
+SI Float clamp(Float a, Float minVal, Float maxVal) {
+  return min(max(a, minVal), maxVal);
+}
+
+#define sqrt __glsl_sqrt
+
+SI float sqrt(float x) { return sqrtf(x); }
+
+SI Float sqrt(Float v) {
+#if USE_SSE2
+  return _mm_sqrt_ps(v);
+#elif USE_NEON
+  Float e = vrsqrteq_f32(v);
+  e *= vrsqrtsq_f32(v, e * e);
+  e *= vrsqrtsq_f32(v, e * e);
+  return if_then(v != Float(0.0f), v * e);
+#else
+  return (Float){sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w)};
+#endif
+}
+
+SI float recip(float x) {
+#if USE_SSE2
+  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
+#else
+  return 1.0f / x;
+#endif
+}
+
+// Use a fast vector reciprocal approximation when available. This should only
+// be used in cases where it is okay that the approximation is imprecise -
+// essentially visually correct but numerically wrong. Otherwise just rely on
+// however the compiler would implement slower division if the platform doesn't
+// provide a convenient intrinsic.
+SI Float recip(Float v) {
+#if USE_SSE2
+  return _mm_rcp_ps(v);
+#elif USE_NEON
+  Float e = vrecpeq_f32(v);
+  return vrecpsq_f32(v, e) * e;
+#else
+  return 1.0f / v;
+#endif
+}
+
+SI float inversesqrt(float x) {
+#if USE_SSE2
+  return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
+#else
+  return 1.0f / sqrtf(x);
+#endif
+}
+
+SI Float inversesqrt(Float v) {
+#if USE_SSE2
+  return _mm_rsqrt_ps(v);
+#elif USE_NEON
+  Float e = vrsqrteq_f32(v);
+  return vrsqrtsq_f32(v, e * e) * e;
+#else
+  return 1.0f / sqrt(v);
+#endif
+}
+
+SI float step(float edge, float x) { return float(x >= edge); }
+
+SI Float step(Float edge, Float x) { return if_then(x >= edge, Float(1)); }
+
+/*
+enum RGBA {
+        R,
+        G,
+        B,
+        A
+};*/
+
+enum XYZW {
+  X = 0,
+  Y = 1,
+  Z = 2,
+  W = 3,
+  R = 0,
+  G = 1,
+  B = 2,
+  A = 3,
+};
+
+struct bvec4_scalar;
+
+struct bvec2_scalar {
+  bool x;
+  bool y;
+
+  bvec2_scalar() : bvec2_scalar(false) {}
+  IMPLICIT constexpr bvec2_scalar(bool a) : x(a), y(a) {}
+  constexpr bvec2_scalar(bool x, bool y) : x(x), y(y) {}
+
+  bool& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  bool sel(XYZW c1) { return select(c1); }
+
+  bvec2_scalar sel(XYZW c1, XYZW c2) {
+    return bvec2_scalar(select(c1), select(c2));
+  }
+  bvec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+};
+
+struct bvec2_scalar1 {
+  bool x;
+
+  IMPLICIT constexpr bvec2_scalar1(bool a) : x(a) {}
+
+  operator bvec2_scalar() const { return bvec2_scalar(x); }
+};
+
+struct bvec2 {
+  bvec2() : bvec2(0) {}
+  IMPLICIT bvec2(Bool a) : x(a), y(a) {}
+  bvec2(Bool x, Bool y) : x(x), y(y) {}
+  Bool& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Bool sel(XYZW c1) { return select(c1); }
+
+  bvec2 sel(XYZW c1, XYZW c2) { return bvec2(select(c1), select(c2)); }
+
+  bvec2 operator~() { return bvec2(~x, ~y); }
+
+  Bool x;
+  Bool y;
+};
+
+bvec2_scalar1 make_bvec2(bool n) { return bvec2_scalar1(n); }
+
+bvec2_scalar make_bvec2(bool x, bool y) { return bvec2_scalar{x, y}; }
+
+template <typename N>
+bvec2 make_bvec2(const N& n) {
+  return bvec2(n);
+}
+
+template <typename X, typename Y>
+bvec2 make_bvec2(const X& x, const Y& y) {
+  return bvec2(x, y);
+}
+
+struct vec3_scalar;
+struct vec4_scalar;
+
+struct vec2_scalar {
+  typedef struct vec2 vector_type;
+  typedef float element_type;
+
+  float x;
+  float y;
+
+  constexpr vec2_scalar() : vec2_scalar(0.0f) {}
+  IMPLICIT constexpr vec2_scalar(float a) : x(a), y(a) {}
+  IMPLICIT constexpr vec2_scalar(int a) : x(a), y(a) {}
+  constexpr vec2_scalar(float x, float y) : x(x), y(y) {}
+
+  float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  float& sel(XYZW c1) { return select(c1); }
+  vec2_scalar sel(XYZW c1, XYZW c2) {
+    return vec2_scalar(select(c1), select(c2));
+  }
+  vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3);
+  vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+
+  friend bool operator==(const vec2_scalar& l, const vec2_scalar& r) {
+    return l.x == r.x && l.y == r.y;
+  }
+
+  friend bool operator!=(const vec2_scalar& l, const vec2_scalar& r) {
+    return l.x != r.x || l.y != r.y;
+  }
+
+  friend vec2_scalar operator*(float a, vec2_scalar b) {
+    return vec2_scalar(a * b.x, a * b.y);
+  }
+  friend vec2_scalar operator*(vec2_scalar a, float b) {
+    return vec2_scalar(a.x * b, a.y * b);
+  }
+  friend vec2_scalar operator*(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x * b.x, a.y * b.y);
+  }
+  friend vec2_scalar operator/(vec2_scalar a, float b) {
+    return vec2_scalar(a.x / b, a.y / b);
+  }
+  friend vec2_scalar operator/(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x / b.x, a.y / b.y);
+  }
+
+  friend vec2_scalar operator-(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x - b.x, a.y - b.y);
+  }
+  friend vec2_scalar operator-(vec2_scalar a, float b) {
+    return vec2_scalar(a.x - b, a.y - b);
+  }
+  friend vec2_scalar operator-(float a, vec2_scalar b) {
+    return vec2_scalar(a - b.x, a - b.y);
+  }
+  friend vec2_scalar operator+(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x + b.x, a.y + b.y);
+  }
+  friend vec2_scalar operator+(vec2_scalar a, float b) {
+    return vec2_scalar(a.x + b, a.y + b);
+  }
+
+  vec2_scalar operator-() { return vec2_scalar(-x, -y); }
+
+  vec2_scalar operator*=(vec2_scalar a) {
+    x *= a.x;
+    y *= a.y;
+    return *this;
+  }
+
+  vec2_scalar operator/=(vec2_scalar a) {
+    x /= a.x;
+    y /= a.y;
+    return *this;
+  }
+
+  vec2_scalar operator+=(vec2_scalar a) {
+    x += a.x;
+    y += a.y;
+    return *this;
+  }
+
+  vec2_scalar operator-=(vec2_scalar a) {
+    x -= a.x;
+    y -= a.y;
+    return *this;
+  }
+};
+
+struct vec2_scalar_ref {
+  vec2_scalar_ref(float& x, float& y) : x(x), y(y) {}
+  float& x;
+  float& y;
+
+  float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  float& sel(XYZW c1) { return select(c1); }
+
+  vec2_scalar_ref& operator=(const vec2_scalar& a) {
+    x = a.x;
+    y = a.y;
+    return *this;
+  }
+  vec2_scalar_ref& operator*=(vec2_scalar a) {
+    x *= a.x;
+    y *= a.y;
+    return *this;
+  }
+  operator vec2_scalar() const { return vec2_scalar{x, y}; }
+};
+
+struct vec2 {
+  typedef struct vec2 vector_type;
+  typedef float element_type;
+
+  constexpr vec2() : vec2(Float(0.0f)) {}
+  IMPLICIT constexpr vec2(Float a) : x(a), y(a) {}
+  vec2(Float x, Float y) : x(x), y(y) {}
+  IMPLICIT constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {}
+  constexpr vec2(vec2_scalar s0, vec2_scalar s1, vec2_scalar s2, vec2_scalar s3)
+      : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}) {}
+  explicit vec2(ivec2 a);
+  Float x;
+  Float y;
+
+  Float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Float& sel(XYZW c1) { return select(c1); }
+  vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); }
+
+  vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+
+  vec2 operator*=(Float a) {
+    x *= a;
+    y *= a;
+    return *this;
+  }
+  vec2 operator*=(vec2 a) {
+    x *= a.x;
+    y *= a.y;
+    return *this;
+  }
+
+  vec2 operator/=(Float a) {
+    x /= a;
+    y /= a;
+    return *this;
+  }
+  vec2 operator/=(vec2 a) {
+    x /= a.x;
+    y /= a.y;
+    return *this;
+  }
+
+  vec2 operator+=(vec2 a) {
+    x += a.x;
+    y += a.y;
+    return *this;
+  }
+  vec2 operator-=(vec2 a) {
+    x -= a.x;
+    y -= a.y;
+    return *this;
+  }
+  vec2 operator-=(Float a) {
+    x -= a;
+    y -= a;
+    return *this;
+  }
+
+  vec2 operator-() { return vec2(-x, -y); }
+
+  friend I32 operator==(const vec2& l, const vec2& r) {
+    return l.x == r.x && l.y == r.y;
+  }
+
+  friend I32 operator!=(const vec2& l, const vec2& r) {
+    return l.x != r.x || l.y != r.y;
+  }
+
+  friend vec2 operator*(vec2 a, Float b) { return vec2(a.x * b, a.y * b); }
+  friend vec2 operator*(vec2 a, vec2 b) { return vec2(a.x * b.x, a.y * b.y); }
+  friend vec2 operator*(Float a, vec2 b) { return vec2(a * b.x, a * b.y); }
+
+  friend vec2 operator/(vec2 a, vec2 b) { return vec2(a.x / b.x, a.y / b.y); }
+  friend vec2 operator/(vec2 a, Float b) { return vec2(a.x / b, a.y / b); }
+
+  friend vec2 operator-(vec2 a, vec2 b) { return vec2(a.x - b.x, a.y - b.y); }
+  friend vec2 operator-(vec2 a, Float b) { return vec2(a.x - b, a.y - b); }
+  friend vec2 operator-(Float a, vec2 b) { return vec2(a - b.x, a - b.y); }
+  friend vec2 operator+(vec2 a, vec2 b) { return vec2(a.x + b.x, a.y + b.y); }
+  friend vec2 operator+(vec2 a, Float b) { return vec2(a.x + b, a.y + b); }
+  friend vec2 operator+(Float a, vec2 b) { return vec2(a + b.x, a + b.y); }
+};
+
+vec2_scalar force_scalar(const vec2& v) {
+  return vec2_scalar{force_scalar(v.x), force_scalar(v.y)};
+}
+
+vec2_scalar make_vec2(float n) { return vec2_scalar{n, n}; }
+
+vec2_scalar make_vec2(float x, float y) { return vec2_scalar{x, y}; }
+
+vec2_scalar make_vec2(int32_t x, int32_t y) {
+  return vec2_scalar{float(x), float(y)};
+}
+
+template <typename N>
+vec2 make_vec2(const N& n) {
+  return vec2(n);
+}
+
+template <typename X, typename Y>
+vec2 make_vec2(const X& x, const Y& y) {
+  return vec2(x, y);
+}
+
+vec2 operator*(vec2_scalar a, Float b) { return vec2(a.x * b, a.y * b); }
+
+vec2 operator*(Float a, vec2_scalar b) { return vec2(a * b.x, a * b.y); }
+
+SI vec2 min(vec2 a, vec2 b) { return vec2(min(a.x, b.x), min(a.y, b.y)); }
+SI vec2 min(vec2 a, Float b) { return vec2(min(a.x, b), min(a.y, b)); }
+
+SI vec2_scalar min(vec2_scalar a, vec2_scalar b) {
+  return vec2_scalar{min(a.x, b.x), min(a.y, b.y)};
+}
+
+SI vec2 if_then_else(I32 c, vec2 t, vec2 e) {
+  return vec2(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y));
+}
+
+SI vec2 if_then_else(int32_t c, vec2 t, vec2 e) { return c ? t : e; }
+
+vec2 step(vec2 edge, vec2 x) {
+  return vec2(step(edge.x, x.x), step(edge.y, x.y));
+}
+
+vec2_scalar step(vec2_scalar edge, vec2_scalar x) {
+  return vec2_scalar(step(edge.x, x.x), step(edge.y, x.y));
+}
+
+SI vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); }
+SI vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); }
+
+SI vec2_scalar max(vec2_scalar a, vec2_scalar b) {
+  return vec2_scalar{max(a.x, b.x), max(a.y, b.y)};
+}
+SI vec2_scalar max(vec2_scalar a, float b) {
+  return vec2_scalar{max(a.x, b), max(a.y, b)};
+}
+
+Float length(vec2 a) { return sqrt(a.x * a.x + a.y * a.y); }
+
+float length(vec2_scalar a) { return hypotf(a.x, a.y); }
+
+template <typename A, typename B>
+SI auto distance(A a, B b) {
+  return length(a - b);
+}
+
+template <typename T>
+SI T normalize(T a) {
+  return a / length(a);
+}
+
+SI vec2 sqrt(vec2 a) { return vec2(sqrt(a.x), sqrt(a.y)); }
+
+SI vec2_scalar sqrt(vec2_scalar a) { return vec2_scalar(sqrt(a.x), sqrt(a.y)); }
+
+SI vec2 recip(vec2 a) { return vec2(recip(a.x), recip(a.y)); }
+
+SI vec2_scalar recip(vec2_scalar a) {
+  return vec2_scalar(recip(a.x), recip(a.y));
+}
+
+SI vec2 inversesqrt(vec2 a) { return vec2(inversesqrt(a.x), inversesqrt(a.y)); }
+
+SI vec2_scalar inversesqrt(vec2_scalar a) {
+  return vec2_scalar(inversesqrt(a.x), inversesqrt(a.y));
+}
+
+#define abs __glsl_abs
+
+int32_t abs(int32_t a) { return a < 0 ? -a : a; }
+
+float abs(float a) { return fabsf(a); }
+
+Float abs(Float v) {
+#if USE_NEON
+  return vabsq_f32(v);
+#else
+  return bit_cast<Float>(bit_cast<I32>(v) & bit_cast<I32>(0.0f - v));
+#endif
+}
+
+float sign(float a) { return copysignf(1.0f, a); }
+
+Float sign(Float v) {
+  return bit_cast<Float>((bit_cast<I32>(v) & 0x80000000) |
+                         bit_cast<I32>(Float(1.0f)));
+}
+
+Float cast(U32 v) { return CONVERT((I32)v, Float); }
+Float cast(I32 v) { return CONVERT((I32)v, Float); }
+I32 cast(Float v) { return CONVERT(v, I32); }
+
+#define floor __glsl_floor
+
+float floor(float a) { return floorf(a); }
+
+Float floor(Float v) {
+  Float roundtrip = cast(cast(v));
+  return roundtrip - if_then(roundtrip > v, Float(1));
+}
+
+vec2 floor(vec2 v) { return vec2(floor(v.x), floor(v.y)); }
+
+vec2_scalar floor(vec2_scalar v) {
+  return vec2_scalar{floorf(v.x), floorf(v.y)};
+}
+
+#define ceil __glsl_ceil
+
+float ceil(float a) { return ceilf(a); }
+
+Float ceil(Float v) {
+  Float roundtrip = cast(cast(v));
+  return roundtrip + if_then(roundtrip < v, Float(1));
+}
+
+// Round to nearest even
+SI int32_t roundeven(float v, float scale) {
+#if USE_SSE2
+  return _mm_cvtss_si32(_mm_set_ss(v * scale));
+#else
+  return bit_cast<int32_t>(v * scale + float(0xC00000)) - 0x4B400000;
+#endif
+}
+
+SI I32 roundeven(Float v, Float scale) {
+#if USE_SSE2
+  return _mm_cvtps_epi32(v * scale);
+#else
+  // Magic number implementation of round-to-nearest-even
+  // see http://stereopsis.com/sree/fpu2006.html
+  return bit_cast<I32>(v * scale + Float(0xC00000)) - 0x4B400000;
+#endif
+}
+
+// Round towards zero
+SI int32_t roundzero(float v, float scale) { return int32_t(v * scale); }
+
+SI I32 roundzero(Float v, Float scale) { return cast(v * scale); }
+
+// Round whichever direction is fastest for positive numbers
+SI I32 roundfast(Float v, Float scale) {
+#if USE_SSE2
+  return _mm_cvtps_epi32(v * scale);
+#else
+  return cast(v * scale + 0.5f);
+#endif
+}
+
+template <typename T>
+SI auto round_pixel(T v, float scale = 255.0f) {
+  return roundfast(v, scale);
+}
+
+#define round __glsl_round
+
+float round(float a) { return roundf(a); }
+
+Float round(Float v) { return floor(v + 0.5f); }
+
+float fract(float a) { return a - floor(a); }
+
+Float fract(Float v) { return v - floor(v); }
+
+vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); }
+
+vec2_scalar fract(vec2_scalar v) { return vec2_scalar(fract(v.x), fract(v.y)); }
+
+// X derivatives can be approximated by dFdx(x) = x[1] - x[0].
+// Y derivatives are not easily available since we operate in terms of X spans
+// only. To work around, assume dFdy(p.x) = dFdx(p.y), which only holds for
+// uniform scaling, and thus abs(dFdx(p.x)) + abs(dFdy(p.x)) = abs(dFdx(p.x)) +
+// abs(dFdx(p.y)) which mirrors abs(dFdx(p.y)) + abs(dFdy(p.y)) = abs(dFdx(p.y))
+// + abs(dFdx(p.x)).
+vec2_scalar fwidth(vec2 p) {
+  Float d = abs(SHUFFLE(p.x, p.y, 1, 1, 5, 5) - SHUFFLE(p.x, p.y, 0, 0, 4, 4));
+  return vec2_scalar(d.x + d.z);
+}
+
+float dFdx(Float x) { return x.y - x.x; }
+
+vec2_scalar dFdx(vec2 p) { return vec2_scalar(dFdx(p.x), dFdx(p.y)); }
+
+// See
+// http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
+Float approx_log2(Float x) {
+  // e - 127 is a fair approximation of log2(x) in its own right...
+  Float e = cast(bit_cast<U32>(x)) * (1.0f / (1 << 23));
+
+  // ... but using the mantissa to refine its error is _much_ better.
+  Float m = bit_cast<Float>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
+  return e - 124.225514990f - 1.498030302f * m -
+         1.725879990f / (0.3520887068f + m);
+}
+
+Float approx_pow2(Float x) {
+  Float f = fract(x);
+  return bit_cast<Float>(
+      roundfast(1.0f * (1 << 23), x + 121.274057500f - 1.490129070f * f +
+                                      27.728023300f / (4.84252568f - f)));
+}
+
+#define pow __glsl_pow
+
+SI float pow(float x, float y) { return powf(x, y); }
+
+Float pow(Float x, Float y) {
+  return if_then_else((x == 0) | (x == 1), x, approx_pow2(approx_log2(x) * y));
+}
+
+#define exp __glsl_exp
+
+SI float exp(float x) { return expf(x); }
+
+Float exp(Float y) {
+  float l2e = 1.4426950408889634074f;
+  return approx_pow2(l2e * y);
+}
+
+#define exp2 __glsl_exp2
+
+SI float exp2(float x) { return exp2f(x); }
+
+Float exp2(Float x) { return approx_pow2(x); }
+
+#define log __glsl_log
+
+SI float log(float x) { return logf(x); }
+
+Float log(Float x) { return approx_log2(x) * 0.69314718f; }
+
+#define log2 __glsl_log2
+
+SI float log2(float x) { return log2f(x); }
+
+Float log2(Float x) { return approx_log2(x); }
+
+struct ivec4;
+
+struct ivec2_scalar {
+  typedef int32_t element_type;
+
+  int32_t x;
+  int32_t y;
+
+  ivec2_scalar() : ivec2_scalar(0) {}
+  IMPLICIT constexpr ivec2_scalar(int32_t a) : x(a), y(a) {}
+  constexpr ivec2_scalar(int32_t x, int32_t y) : x(x), y(y) {}
+
+  int32_t& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  int32_t& sel(XYZW c1) { return select(c1); }
+  ivec2_scalar sel(XYZW c1, XYZW c2) {
+    return ivec2_scalar{select(c1), select(c2)};
+  }
+
+  ivec2_scalar operator-() const { return ivec2_scalar{-x, -y}; }
+
+  ivec2_scalar& operator+=(ivec2_scalar a) {
+    x += a.x;
+    y += a.y;
+    return *this;
+  }
+  ivec2_scalar& operator+=(int n) {
+    x += n;
+    y += n;
+    return *this;
+  }
+
+  ivec2_scalar& operator>>=(int shift) {
+    x >>= shift;
+    y >>= shift;
+    return *this;
+  }
+
+  friend ivec2_scalar operator&(ivec2_scalar a, int b) {
+    return ivec2_scalar{a.x & b, a.y & b};
+  }
+
+  friend ivec2_scalar operator+(ivec2_scalar a, ivec2_scalar b) {
+    return ivec2_scalar{a.x + b.x, a.y + b.y};
+  }
+  friend ivec2_scalar operator+(ivec2_scalar a, int b) {
+    return ivec2_scalar{a.x + b, a.y + b};
+  }
+
+  friend ivec2_scalar operator-(ivec2_scalar a, ivec2_scalar b) {
+    return ivec2_scalar{a.x - b.x, a.y - b.y};
+  }
+  friend ivec2_scalar operator-(ivec2_scalar a, int b) {
+    return ivec2_scalar{a.x - b, a.y - b};
+  }
+
+  friend bool operator==(const ivec2_scalar& l, const ivec2_scalar& r) {
+    return l.x == r.x && l.y == r.y;
+  }
+};
+
+struct ivec2 {
+  typedef int32_t element_type;
+
+  ivec2() : ivec2(I32(0)) {}
+  IMPLICIT ivec2(I32 a) : x(a), y(a) {}
+  ivec2(I32 x, I32 y) : x(x), y(y) {}
+  IMPLICIT ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {}
+  ivec2(U32 x, U32 y) : x(CONVERT(x, I32)), y(CONVERT(y, I32)) {}
+  IMPLICIT constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {}
+  constexpr ivec2(ivec2_scalar s0, ivec2_scalar s1, ivec2_scalar s2,
+                  ivec2_scalar s3)
+      : x(I32{s0.x, s1.x, s2.x, s3.x}), y(I32{s0.y, s1.y, s2.y, s3.y}) {}
+  I32 x;
+  I32 y;
+
+  I32& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  I32& sel(XYZW c1) { return select(c1); }
+
+  ivec2 sel(XYZW c1, XYZW c2) { return ivec2(select(c1), select(c2)); }
+
+  ivec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+
+  ivec2& operator*=(I32 a) {
+    x *= a;
+    y *= a;
+    return *this;
+  }
+  ivec2& operator+=(ivec2 a) {
+    x += a.x;
+    y += a.y;
+    return *this;
+  }
+  ivec2& operator>>=(int shift) {
+    x >>= shift;
+    y >>= shift;
+    return *this;
+  }
+
+  friend ivec2 operator*(ivec2 a, I32 b) { return ivec2(a.x * b, a.y * b); }
+  friend ivec2 operator&(ivec2 a, ivec2 b) {
+    return ivec2(a.x & b.x, a.y & b.y);
+  }
+  friend ivec2 operator&(ivec2 a, I32 b) { return ivec2(a.x & b, a.y & b); }
+  friend ivec2 operator+(ivec2 a, ivec2 b) {
+    return ivec2(a.x + b.x, a.y + b.y);
+  }
+};
+
+vec2::vec2(ivec2 a) : x(cast(a.x)), y(cast(a.y)) {}
+
+ivec2_scalar make_ivec2(int32_t n) { return ivec2_scalar{n, n}; }
+
+ivec2_scalar make_ivec2(uint32_t n) {
+  return ivec2_scalar{int32_t(n), int32_t(n)};
+}
+
+ivec2_scalar make_ivec2(int32_t x, int32_t y) { return ivec2_scalar{x, y}; }
+
+ivec2_scalar make_ivec2(uint32_t x, uint32_t y) {
+  return ivec2_scalar{int32_t(x), int32_t(y)};
+}
+
+vec2_scalar make_vec2(const ivec2_scalar& v) {
+  return vec2_scalar{float(v.x), float(v.y)};
+}
+
+ivec2_scalar make_ivec2(const vec2_scalar& v) {
+  return ivec2_scalar{int32_t(v.x), int32_t(v.y)};
+}
+
+template <typename N>
+ivec2 make_ivec2(const N& n) {
+  return ivec2(n);
+}
+
+template <typename X, typename Y>
+ivec2 make_ivec2(const X& x, const Y& y) {
+  return ivec2(x, y);
+}
+
+ivec2_scalar force_scalar(const ivec2& v) {
+  return ivec2_scalar{force_scalar(v.x), force_scalar(v.y)};
+}
+
+struct ivec3_scalar {
+  int32_t x;
+  int32_t y;
+  int32_t z;
+
+  ivec3_scalar() : ivec3_scalar(0) {}
+  IMPLICIT constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {}
+  constexpr ivec3_scalar(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {}
+
+  int32_t& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      default:
+        UNREACHABLE;
+    }
+  }
+  int32_t& sel(XYZW c1) { return select(c1); }
+  ivec2_scalar sel(XYZW c1, XYZW c2) {
+    return ivec2_scalar{select(c1), select(c2)};
+  }
+};
+
+struct ivec3 {
+  ivec3() : ivec3(0) {}
+  IMPLICIT ivec3(I32 a) : x(a), y(a), z(a) {}
+  ivec3(I32 x, I32 y, I32 z) : x(x), y(y), z(z) {}
+  ivec3(ivec2 a, I32 b) : x(a.x), y(a.y), z(b) {}
+  ivec3(vec2 a, Float b) : x(cast(a.x)), y(cast(a.y)), z(cast(b)) {}
+  I32 x;
+  I32 y;
+  I32 z;
+
+  friend ivec3 operator+(ivec3 a, ivec3 b) {
+    return ivec3(a.x + b.x, a.y + b.y, a.z + b.z);
+  }
+};
+
+vec2_scalar make_vec2(ivec3_scalar s) {
+  return vec2_scalar{float(s.x), float(s.y)};
+}
+
+ivec3_scalar make_ivec3(int32_t n) { return ivec3_scalar{n, n, n}; }
+
+ivec3_scalar make_ivec3(const ivec2_scalar& v, int32_t z) {
+  return ivec3_scalar{v.x, v.y, z};
+}
+
+ivec3_scalar make_ivec3(int32_t x, int32_t y, int32_t z) {
+  return ivec3_scalar{x, y, z};
+}
+
+template <typename N>
+ivec3 make_ivec3(const N& n) {
+  return ivec3(n);
+}
+
+template <typename X, typename Y>
+ivec3 make_ivec3(const X& x, const Y& y) {
+  return ivec3(x, y);
+}
+
+template <typename X, typename Y, typename Z>
+ivec3 make_ivec3(const X& x, const Y& y, const Z& z) {
+  return ivec3(x, y, z);
+}
+
+struct ivec4_scalar {
+  typedef int32_t element_type;
+
+  int32_t x;
+  int32_t y;
+  int32_t z;
+  int32_t w;
+
+  ivec4_scalar() : ivec4_scalar(0) {}
+  IMPLICIT constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {}
+  constexpr ivec4_scalar(int32_t x, int32_t y, int32_t z, int32_t w)
+      : x(x), y(y), z(z), w(w) {}
+
+  int32_t& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      case W:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+  int32_t& sel(XYZW c1) { return select(c1); }
+  ivec2_scalar sel(XYZW c1, XYZW c2) {
+    return ivec2_scalar{select(c1), select(c2)};
+  }
+
+  friend ivec4_scalar operator&(int32_t a, ivec4_scalar b) {
+    return ivec4_scalar{a & b.x, a & b.y, a & b.z, a & b.w};
+  }
+  friend ivec4_scalar operator<<(ivec4_scalar a, int32_t b) {
+    return ivec4_scalar{a.x << b, a.y << b, a.z << b, a.w << b};
+  }
+
+  int32_t& operator[](int index) {
+    switch (index) {
+      case 0:
+        return x;
+      case 1:
+        return y;
+      case 2:
+        return z;
+      case 3:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+};
+
+struct ivec4 {
+  typedef int32_t element_type;
+
+  ivec4() : ivec4(I32(0)) {}
+  IMPLICIT ivec4(I32 a) : x(a), y(a), z(a), w(a) {}
+  ivec4(I32 x, I32 y, I32 z, I32 w) : x(x), y(y), z(z), w(w) {}
+  ivec4(ivec2 a, I32 b, I32 c) : x(a.x), y(a.y), z(b), w(c) {}
+  IMPLICIT constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
+  constexpr ivec4(ivec4_scalar s0, ivec4_scalar s1, ivec4_scalar s2,
+                  ivec4_scalar s3)
+      : x(I32{s0.x, s1.x, s2.x, s3.x}),
+        y(I32{s0.y, s1.y, s2.y, s3.y}),
+        z(I32{s0.z, s1.z, s2.z, s3.z}),
+        w(I32{s0.w, s1.w, s2.w, s3.w}) {}
+
+  I32& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      case W:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+  I32 sel(XYZW c1) { return select(c1); }
+
+  ivec2 sel(XYZW c1, XYZW c2) { return ivec2(select(c1), select(c2)); }
+
+  ivec3 sel(XYZW c1, XYZW c2, XYZW c3) {
+    return ivec3(select(c1), select(c2), select(c3));
+  }
+
+  friend ivec4 operator&(I32 a, ivec4 b) {
+    return ivec4(a & b.x, a & b.y, a & b.z, a & b.w);
+  }
+
+  I32 x;
+  I32 y;
+  I32 z;
+  I32 w;
+};
+
+ivec4_scalar force_scalar(const ivec4& v) {
+  return ivec4_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z),
+                      force_scalar(v.w)};
+}
+
+ivec4_scalar make_ivec4(int32_t n) { return ivec4_scalar{n, n, n, n}; }
+
+ivec4_scalar make_ivec4(const ivec2_scalar& xy, int32_t z, int32_t w) {
+  return ivec4_scalar{xy.x, xy.y, z, w};
+}
+
+ivec4_scalar make_ivec4(int32_t x, int32_t y, int32_t z, int32_t w) {
+  return ivec4_scalar{x, y, z, w};
+}
+
+template <typename N>
+ivec4 make_ivec4(const N& n) {
+  return ivec4(n);
+}
+
+template <typename X, typename Y, typename Z>
+ivec4 make_ivec4(const X& x, const Y& y, const Z& z) {
+  return ivec4(x, y, z);
+}
+
+template <typename X, typename Y, typename Z, typename W>
+ivec4 make_ivec4(const X& x, const Y& y, const Z& z, const W& w) {
+  return ivec4(x, y, z, w);
+}
+
+SI ivec2 if_then_else(I32 c, ivec2 t, ivec2 e) {
+  return ivec2(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y));
+}
+
+SI ivec2 if_then_else(int32_t c, ivec2 t, ivec2 e) { return c ? t : e; }
+
+SI ivec4 if_then_else(I32 c, ivec4 t, ivec4 e) {
+  return ivec4(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y),
+               if_then_else(c, t.z, e.z), if_then_else(c, t.w, e.w));
+}
+
+SI ivec4 if_then_else(int32_t c, ivec4 t, ivec4 e) { return c ? t : e; }
+
+ivec4 operator&(I32 a, ivec4_scalar b) {
+  return ivec4(a & b.x, a & b.y, a & b.z, a & b.w);
+}
+
+struct bvec3_scalar {
+  bool x;
+  bool y;
+  bool z;
+
+  bvec3_scalar() : bvec3_scalar(false) {}
+  IMPLICIT constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {}
+  constexpr bvec3_scalar(bool x, bool y, bool z) : x(x), y(y), z(z) {}
+};
+
+struct bvec3_scalar1 {
+  bool x;
+
+  IMPLICIT constexpr bvec3_scalar1(bool a) : x(a) {}
+
+  operator bvec3_scalar() const { return bvec3_scalar(x); }
+};
+
+struct bvec3 {
+  bvec3() : bvec3(0) {}
+  IMPLICIT bvec3(Bool a) : x(a), y(a), z(a) {}
+  bvec3(Bool x, Bool y, Bool z) : x(x), y(y), z(z) {}
+  Bool& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Bool sel(XYZW c1) { return select(c1); }
+
+  Bool x;
+  Bool y;
+  Bool z;
+};
+
+bvec3_scalar1 make_bvec3(bool n) { return bvec3_scalar1(n); }
+
+struct bvec4_scalar {
+  bool x;
+  bool y;
+  bool z;
+  bool w;
+
+  bvec4_scalar() : bvec4_scalar(false) {}
+  IMPLICIT constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {}
+  constexpr bvec4_scalar(bool x, bool y, bool z, bool w)
+      : x(x), y(y), z(z), w(w) {}
+
+  bool& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      case W:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+  bool sel(XYZW c1) { return select(c1); }
+  bvec2_scalar sel(XYZW c1, XYZW c2) {
+    return bvec2_scalar(select(c1), select(c2));
+  }
+};
+
+bvec4_scalar bvec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  return bvec4_scalar{select(c1), select(c2), select(c3), select(c4)};
+}
+
+struct bvec4_scalar1 {
+  bool x;
+
+  IMPLICIT constexpr bvec4_scalar1(bool a) : x(a) {}
+
+  operator bvec4_scalar() const { return bvec4_scalar(x); }
+};
+
+struct bvec4 {
+  bvec4() : bvec4(0) {}
+  IMPLICIT bvec4(Bool a) : x(a), y(a), z(a), w(a) {}
+  bvec4(Bool x, Bool y, Bool z, Bool w) : x(x), y(y), z(z), w(w) {}
+  bvec4(bvec2 x, bvec2 y) : x(x.x), y(x.y), z(y.x), w(y.y) {}
+  Bool& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      case W:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Bool sel(XYZW c1) { return select(c1); }
+
+  Bool x;
+  Bool y;
+  Bool z;
+  Bool w;
+};
+
+bvec4_scalar1 make_bvec4(bool n) { return bvec4_scalar1(n); }
+
+bvec4_scalar make_bvec4(bool x, bool y, bool z, bool w) {
+  return bvec4_scalar{x, y, z, w};
+}
+
+bvec4_scalar make_bvec4(bvec2_scalar a, bvec2_scalar b) {
+  return bvec4_scalar{a.x, a.y, b.x, b.y};
+}
+
+template <typename N>
+bvec4 make_bvec4(const N& n) {
+  return bvec4(n);
+}
+
+template <typename X, typename Y>
+bvec4 make_bvec4(const X& x, const Y& y) {
+  return bvec4(x, y);
+}
+
+template <typename X, typename Y, typename Z, typename W>
+bvec4 make_bvec4(const X& x, const Y& y, const Z& z, const W& w) {
+  return bvec4(x, y, z, w);
+}
+
+struct vec2_ref {
+  vec2_ref(Float& x, Float& y) : x(x), y(y) {}
+  Float& x;
+  Float& y;
+
+  Float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Float& sel(XYZW c1) { return select(c1); }
+
+  vec2_ref& operator=(const vec2& a) {
+    x = a.x;
+    y = a.y;
+    return *this;
+  }
+
+  vec2_ref& operator/=(Float a) {
+    x /= a;
+    y /= a;
+    return *this;
+  }
+
+  vec2_ref& operator/=(vec2 a) {
+    x /= a.x;
+    y /= a.y;
+    return *this;
+  }
+
+  vec2_ref& operator+=(vec2 a) {
+    x += a.x;
+    y += a.y;
+    return *this;
+  }
+  vec2_ref& operator-=(vec2 a) {
+    x -= a.x;
+    y -= a.y;
+    return *this;
+  }
+  vec2_ref& operator*=(vec2 a) {
+    x *= a.x;
+    y *= a.y;
+    return *this;
+  }
+};
+
+struct vec3_scalar {
+  typedef struct vec3 vector_type;
+  typedef float element_type;
+
+  float x;
+  float y;
+  float z;
+
+  constexpr vec3_scalar() : vec3_scalar(0.0f) {}
+  IMPLICIT constexpr vec3_scalar(float a) : x(a), y(a), z(a) {}
+  constexpr vec3_scalar(float x, float y, float z) : x(x), y(y), z(z) {}
+
+  float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      default:
+        UNREACHABLE;
+    }
+  }
+  float& sel(XYZW c1) { return select(c1); }
+  vec2_scalar sel(XYZW c1, XYZW c2) {
+    return vec2_scalar(select(c1), select(c2));
+  }
+  vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_scalar(select(c1), select(c2), select(c3));
+  }
+  vec2_scalar_ref lsel(XYZW c1, XYZW c2) {
+    return vec2_scalar_ref(select(c1), select(c2));
+  }
+
+  friend vec3_scalar operator*(vec3_scalar a, vec3_scalar b) {
+    return vec3_scalar{a.x * b.x, a.y * b.y, a.z * b.z};
+  }
+  friend vec3_scalar operator*(vec3_scalar a, float b) {
+    return vec3_scalar{a.x * b, a.y * b, a.z * b};
+  }
+
+  friend vec3_scalar operator-(vec3_scalar a, vec3_scalar b) {
+    return vec3_scalar{a.x - b.x, a.y - b.y, a.z - b.z};
+  }
+  friend vec3_scalar operator-(vec3_scalar a, float b) {
+    return vec3_scalar{a.x - b, a.y - b, a.z - b};
+  }
+  friend vec3_scalar operator+(vec3_scalar a, vec3_scalar b) {
+    return vec3_scalar{a.x + b.x, a.y + b.y, a.z + b.z};
+  }
+  friend vec3_scalar operator+(vec3_scalar a, float b) {
+    return vec3_scalar{a.x + b, a.y + b, a.z + b};
+  }
+
+  friend vec3_scalar operator/(vec3_scalar a, vec3_scalar b) {
+    return vec3_scalar{a.x / b.x, a.y / b.y, a.z / b.z};
+  }
+  friend vec3_scalar operator/(vec3_scalar a, float b) {
+    return vec3_scalar{a.x / b, a.y / b, a.z / b};
+  }
+
+  vec3_scalar operator+=(vec3_scalar a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    return *this;
+  }
+
+  friend bool operator==(const vec3_scalar& l, const vec3_scalar& r) {
+    return l.x == r.x && l.y == r.y && l.z == r.z;
+  }
+};
+
+struct vec3_scalar_ref {
+  vec3_scalar_ref(float& x, float& y, float& z) : x(x), y(y), z(z) {}
+  float& x;
+  float& y;
+  float& z;
+
+  float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      default:
+        UNREACHABLE;
+    }
+  }
+  float& sel(XYZW c1) { return select(c1); }
+
+  vec3_scalar_ref& operator=(const vec3_scalar& a) {
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    return *this;
+  }
+
+  operator vec3_scalar() const { return vec3_scalar{x, y, z}; }
+};
+
+struct vec3 {
+  typedef struct vec3 vector_type;
+  typedef float element_type;
+
+  constexpr vec3() : vec3(Float(0.0f)) {}
+  IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {}
+  constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {}
+  vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {}
+  explicit vec3(vec4);
+  IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {}
+  constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3)
+      : x(Float{s0.x, s1.x, s2.x, s3.x}),
+        y(Float{s0.y, s1.y, s2.y, s3.y}),
+        z(Float{s0.z, s1.z, s2.z, s3.z}) {}
+  Float x;
+  Float y;
+  Float z;
+
+  Float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Float& sel(XYZW c1) { return select(c1); }
+
+  vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); }
+
+  vec3 sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3(select(c1), select(c2), select(c3));
+  }
+
+  vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+
+  vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); }
+
+  friend vec3 operator*(vec3 a, Float b) {
+    return vec3(a.x * b, a.y * b, a.z * b);
+  }
+  friend vec3 operator*(vec3 a, vec3 b) {
+    return vec3(a.x * b.x, a.y * b.y, a.z * b.z);
+  }
+  friend vec3 operator*(Float a, vec3 b) {
+    return vec3(a * b.x, a * b.y, a * b.z);
+  }
+
+  friend vec3 operator/(vec3 a, Float b) {
+    return vec3(a.x / b, a.y / b, a.z / b);
+  }
+  friend vec3 operator/(vec3 a, vec3 b) {
+    return vec3(a.x / b.x, a.y / b.y, a.z / b.z);
+  }
+
+  friend I32 operator==(const vec3& l, const vec3& r) {
+    return l.x == r.x && l.y == r.y && l.z == r.z;
+  }
+
+  friend vec3 operator-(vec3 a, Float b) {
+    return vec3(a.x - b, a.y - b, a.z - b);
+  }
+  friend vec3 operator-(vec3 a, vec3 b) {
+    return vec3(a.x - b.x, a.y - b.y, a.z - b.z);
+  }
+  friend vec3 operator+(vec3 a, Float b) {
+    return vec3(a.x + b, a.y + b, a.z + b);
+  }
+  friend vec3 operator+(vec3 a, vec3 b) {
+    return vec3(a.x + b.x, a.y + b.y, a.z + b.z);
+  }
+
+  vec3 operator+=(vec3_scalar a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    return *this;
+  }
+  vec3& operator+=(vec3 a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    return *this;
+  }
+};
+
+vec3_scalar force_scalar(const vec3& v) {
+  return vec3_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z)};
+}
+
+vec3_scalar make_vec3(float n) { return vec3_scalar{n, n, n}; }
+
+vec3_scalar make_vec3(const vec2_scalar& v, float z) {
+  return vec3_scalar{v.x, v.y, z};
+}
+
+vec3_scalar make_vec3(float x, float y, float z) {
+  return vec3_scalar{x, y, z};
+}
+
+vec3_scalar make_vec3(int32_t x, int32_t y, float z) {
+  return vec3_scalar{float(x), float(y), z};
+}
+
+template <typename N>
+vec3 make_vec3(const N& n) {
+  return vec3(n);
+}
+
+template <typename X, typename Y>
+vec3 make_vec3(const X& x, const Y& y) {
+  return vec3(x, y);
+}
+
+template <typename X, typename Y, typename Z>
+vec3 make_vec3(const X& x, const Y& y, const Z& z) {
+  return vec3(x, y, z);
+}
+
+SI vec3 if_then_else(I32 c, vec3 t, vec3 e) {
+  return vec3(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y),
+              if_then_else(c, t.z, e.z));
+}
+
+SI vec3 if_then_else(int32_t c, vec3 t, vec3 e) { return c ? t : e; }
+
+SI vec3 if_then_else(ivec3 c, vec3 t, vec3 e) {
+  return vec3(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y),
+              if_then_else(c.z, t.z, e.z));
+}
+
+vec3 step(vec3 edge, vec3 x) {
+  return vec3(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z));
+}
+
+vec3_scalar step(vec3_scalar edge, vec3_scalar x) {
+  return vec3_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z));
+}
+
+SI vec3 min(vec3 a, vec3 b) {
+  return vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+}
+SI vec3 min(vec3 a, Float b) {
+  return vec3(min(a.x, b), min(a.y, b), min(a.z, b));
+}
+SI vec3_scalar min(vec3_scalar a, vec3_scalar b) {
+  return vec3_scalar{min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)};
+}
+
+SI vec3 max(vec3 a, vec3 b) {
+  return vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+}
+SI vec3 max(vec3 a, Float b) {
+  return vec3(max(a.x, b), max(a.y, b), max(a.z, b));
+}
+SI vec3_scalar max(vec3_scalar a, vec3_scalar b) {
+  return vec3_scalar{max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
+}
+
+vec3 pow(vec3 x, vec3 y) {
+  return vec3(pow(x.x, y.x), pow(x.y, y.y), pow(x.z, y.z));
+}
+
+struct vec3_ref {
+  vec3_ref(Float& x, Float& y, Float& z) : x(x), y(y), z(z) {}
+  Float& x;
+  Float& y;
+  Float& z;
+  vec3_ref& operator=(const vec3& a) {
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    return *this;
+  }
+
+  vec3_ref& operator/=(Float a) {
+    x /= a;
+    y /= a;
+    z /= a;
+    return *this;
+  }
+
+  vec3_ref& operator*=(Float a) {
+    x *= a;
+    y *= a;
+    z *= a;
+    return *this;
+  }
+};
+
+struct vec4_scalar {
+  typedef struct vec4 vector_type;
+  typedef float element_type;
+
+  float x;
+  float y;
+  float z;
+  float w;
+
+  constexpr vec4_scalar() : vec4_scalar(0.0f) {}
+  IMPLICIT constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {}
+  constexpr vec4_scalar(float x, float y, float z, float w)
+      : x(x), y(y), z(z), w(w) {}
+  vec4_scalar(vec3_scalar xyz, float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+
+  static vec4_scalar load_from_ptr(const float* f) {
+    return vec4_scalar(f[0], f[1], f[2], f[3]);
+  }
+
+  ALWAYS_INLINE float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      case W:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+  float& sel(XYZW c1) { return select(c1); }
+  vec2_scalar sel(XYZW c1, XYZW c2) {
+    return vec2_scalar{select(c1), select(c2)};
+  }
+  vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_scalar{select(c1), select(c2), select(c3)};
+  }
+  vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+    return vec4_scalar{select(c1), select(c2), select(c3), select(c4)};
+  }
+  vec2_scalar_ref lsel(XYZW c1, XYZW c2) {
+    return vec2_scalar_ref(select(c1), select(c2));
+  }
+  vec3_scalar_ref lsel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_scalar_ref(select(c1), select(c2), select(c3));
+  }
+
+  friend vec4_scalar operator*(vec4_scalar a, vec4_scalar b) {
+    return vec4_scalar{a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w};
+  }
+  friend vec4_scalar operator*(vec4_scalar a, float b) {
+    return vec4_scalar{a.x * b, a.y * b, a.z * b, a.w * b};
+  }
+  friend vec4_scalar operator*(float a, vec4_scalar b) {
+    return vec4_scalar{a * b.x, a * b.y, a * b.z, a * b.w};
+  }
+  vec4_scalar& operator*=(float a) {
+    x *= a;
+    y *= a;
+    z *= a;
+    w *= a;
+    return *this;
+  }
+
+  friend vec4_scalar operator-(vec4_scalar a, vec4_scalar b) {
+    return vec4_scalar{a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w};
+  }
+  friend vec4_scalar operator-(vec4_scalar a, float b) {
+    return vec4_scalar{a.x - b, a.y - b, a.z - b, a.w - b};
+  }
+  friend vec4_scalar operator+(vec4_scalar a, vec4_scalar b) {
+    return vec4_scalar{a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w};
+  }
+  friend vec4_scalar operator+(vec4_scalar a, float b) {
+    return vec4_scalar{a.x + b, a.y + b, a.z + b, a.w + b};
+  }
+
+  friend vec4_scalar operator/(vec4_scalar a, vec4_scalar b) {
+    return vec4_scalar{a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w};
+  }
+  friend vec4_scalar operator/(vec4_scalar a, float b) {
+    return vec4_scalar{a.x / b, a.y / b, a.z / b, a.w / b};
+  }
+
+  vec4_scalar& operator+=(vec4_scalar a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    w += a.w;
+    return *this;
+  }
+
+  vec4_scalar& operator/=(vec4_scalar a) {
+    x /= a.x;
+    y /= a.y;
+    z /= a.z;
+    w /= a.w;
+    return *this;
+  }
+
+  vec4_scalar& operator*=(vec4_scalar a) {
+    x *= a.x;
+    y *= a.y;
+    z *= a.z;
+    w *= a.w;
+    return *this;
+  }
+
+  friend bool operator==(const vec4_scalar& l, const vec4_scalar& r) {
+    return l.x == r.x && l.y == r.y && l.z == r.z && l.w == r.w;
+  }
+
+  friend bool operator!=(const vec4_scalar& l, const vec4_scalar& r) {
+    return l.x != r.x || l.y != r.y || l.z != r.z || l.w != r.w;
+  }
+};
+
+vec3_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3) {
+  return {select(c1), select(c2), select(c3)};
+}
+vec4_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  return vec4_scalar{select(c1), select(c2), select(c3), select(c4)};
+}
+
+struct vec4_ref {
+  vec4_ref(Float& x, Float& y, Float& z, Float& w) : x(x), y(y), z(z), w(w) {}
+  Float& x;
+  Float& y;
+  Float& z;
+  Float& w;
+
+  vec4_ref& operator=(const vec4& a);
+};
+
+struct vec4 {
+  typedef struct vec4 vector_type;
+  typedef float element_type;
+
+  constexpr vec4() : vec4(Float(0.0f)) {}
+  IMPLICIT constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {}
+  vec4(Float x, Float y, Float z, Float w) : x(x), y(y), z(z), w(w) {}
+  vec4(vec3 xyz, Float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+  vec4(vec2 xy, vec2 zw) : x(xy.x), y(xy.y), z(zw.x), w(zw.y) {}
+  vec4(vec2 xy, Float z, Float w) : x(xy.x), y(xy.y), z(z), w(w) {}
+  vec4(Float x, Float y, vec2 zw) : x(x), y(y), z(zw.x), w(zw.y) {}
+  IMPLICIT constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
+  constexpr vec4(vec4_scalar s0, vec4_scalar s1, vec4_scalar s2, vec4_scalar s3)
+      : x(Float{s0.x, s1.x, s2.x, s3.x}),
+        y(Float{s0.y, s1.y, s2.y, s3.y}),
+        z(Float{s0.z, s1.z, s2.z, s3.z}),
+        w(Float{s0.w, s1.w, s2.w, s3.w}) {}
+  ALWAYS_INLINE Float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      case W:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+  ALWAYS_INLINE Float& sel(XYZW c1) { return select(c1); }
+
+  ALWAYS_INLINE vec2 sel(XYZW c1, XYZW c2) {
+    return vec2(select(c1), select(c2));
+  }
+
+  ALWAYS_INLINE vec3 sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3(select(c1), select(c2), select(c3));
+  }
+  ALWAYS_INLINE vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_ref(select(c1), select(c2), select(c3));
+  }
+
+  ALWAYS_INLINE vec2_ref lsel(XYZW c1, XYZW c2) {
+    return vec2_ref(select(c1), select(c2));
+  }
+
+  ALWAYS_INLINE vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+    return vec4(select(c1), select(c2), select(c3), select(c4));
+  }
+  ALWAYS_INLINE vec4_ref lsel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+    return vec4_ref(select(c1), select(c2), select(c3), select(c4));
+  }
+
+  Float& operator[](int index) {
+    switch (index) {
+      case 0:
+        return x;
+      case 1:
+        return y;
+      case 2:
+        return z;
+      case 3:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+
+  // glsl supports non-const indexing of vecs.
+  // hlsl doesn't. The code it generates is probably not wonderful.
+  Float operator[](I32 index) {
+    float sel_x = 0;
+    switch (index.x) {
+      case 0:
+        sel_x = x.x;
+        break;
+      case 1:
+        sel_x = y.x;
+        break;
+      case 2:
+        sel_x = z.x;
+        break;
+      case 3:
+        sel_x = w.x;
+        break;
+    }
+    float sel_y = 0;
+    switch (index.y) {
+      case 0:
+        sel_y = x.y;
+        break;
+      case 1:
+        sel_y = y.y;
+        break;
+      case 2:
+        sel_y = z.y;
+        break;
+      case 3:
+        sel_y = w.y;
+        break;
+    }
+    float sel_z = 0;
+    switch (index.z) {
+      case 0:
+        sel_z = x.z;
+        break;
+      case 1:
+        sel_z = y.z;
+        break;
+      case 2:
+        sel_z = z.z;
+        break;
+      case 3:
+        sel_z = w.z;
+        break;
+    }
+    float sel_w = 0;
+    switch (index.w) {
+      case 0:
+        sel_w = x.w;
+        break;
+      case 1:
+        sel_w = y.w;
+        break;
+      case 2:
+        sel_w = z.w;
+        break;
+      case 3:
+        sel_w = w.w;
+        break;
+    }
+    Float ret = {sel_x, sel_y, sel_z, sel_w};
+    return ret;
+  }
+
+  friend vec4 operator/(vec4 a, Float b) {
+    return vec4(a.x / b, a.y / b, a.z / b, a.w / b);
+  }
+  friend vec4 operator/(vec4 a, vec4 b) {
+    return vec4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+  }
+
+  friend vec4 operator*(vec4 a, Float b) {
+    return vec4(a.x * b, a.y * b, a.z * b, a.w * b);
+  }
+
+  friend vec4 operator*(Float b, vec4 a) {
+    return vec4(a.x * b, a.y * b, a.z * b, a.w * b);
+  }
+  friend vec4 operator*(vec4 a, vec4 b) {
+    return vec4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+  }
+
+  friend vec4 operator-(vec4 a, vec4 b) {
+    return vec4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+  }
+  friend vec4 operator+(vec4 a, vec4 b) {
+    return vec4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+  }
+  vec4& operator+=(vec4 a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    w += a.w;
+    return *this;
+  }
+  vec4& operator/=(vec4 a) {
+    x /= a.x;
+    y /= a.y;
+    z /= a.z;
+    w /= a.w;
+    return *this;
+  }
+  vec4& operator*=(vec4 a) {
+    x *= a.x;
+    y *= a.y;
+    z *= a.z;
+    w *= a.w;
+    return *this;
+  }
+  vec4& operator*=(Float a) {
+    x *= a;
+    y *= a;
+    z *= a;
+    w *= a;
+    return *this;
+  }
+
+  Float x;
+  Float y;
+  Float z;
+  Float w;
+};
+
+inline vec4_ref& vec4_ref::operator=(const vec4& a) {
+  x = a.x;
+  y = a.y;
+  z = a.z;
+  w = a.w;
+  return *this;
+}
+
+inline vec4 vec3::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  return vec4(select(c1), select(c2), select(c3), select(c4));
+}
+
+vec4_scalar force_scalar(const vec4& v) {
+  return vec4_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z),
+                     force_scalar(v.w)};
+}
+
+vec4_scalar make_vec4(float n) { return vec4_scalar{n, n, n, n}; }
+
+vec4_scalar make_vec4(const vec2_scalar& v, float z, float w) {
+  return vec4_scalar{v.x, v.y, z, w};
+}
+
+vec4_scalar make_vec4(const vec2_scalar& a, const vec2_scalar& b) {
+  return vec4_scalar{a.x, a.y, b.x, b.y};
+}
+
+vec4_scalar make_vec4(const vec3_scalar& v, float w) {
+  return vec4_scalar{v.x, v.y, v.z, w};
+}
+
+vec4_scalar make_vec4(float x, float y, float z, float w) {
+  return vec4_scalar{x, y, z, w};
+}
+
+vec4_scalar make_vec4(float x, float y, const vec2_scalar& v) {
+  return vec4_scalar{x, y, v.x, v.y};
+}
+
+ivec4_scalar make_ivec4(const vec4_scalar& v) {
+  return ivec4_scalar{int32_t(v.x), int32_t(v.y), int32_t(v.z), int32_t(v.w)};
+}
+
+template <typename N>
+vec4 make_vec4(const N& n) {
+  return vec4(n);
+}
+
+template <typename X, typename Y>
+vec4 make_vec4(const X& x, const Y& y) {
+  return vec4(x, y);
+}
+
+template <typename X, typename Y, typename Z>
+vec4 make_vec4(const X& x, const Y& y, const Z& z) {
+  return vec4(x, y, z);
+}
+
+template <typename X, typename Y, typename Z, typename W>
+vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) {
+  return vec4(x, y, z, w);
+}
+
+vec4_scalar make_vec4(const ivec4_scalar& v) {
+  return vec4_scalar{float(v.x), float(v.y), float(v.z), float(v.w)};
+}
+
+ALWAYS_INLINE vec3::vec3(vec4 v) : x(v.x), y(v.y), z(v.z) {}
+
+SI ivec4 roundfast(vec4 v, Float scale) {
+  return ivec4(roundfast(v.x, scale), roundfast(v.y, scale),
+               roundfast(v.z, scale), roundfast(v.w, scale));
+}
+
+vec4 operator*(vec4_scalar a, Float b) {
+  return vec4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+
+SI vec4 if_then_else(I32 c, vec4 t, vec4 e) {
+  return vec4(if_then_else(c, t.x, e.x), if_then_else(c, t.y, e.y),
+              if_then_else(c, t.z, e.z), if_then_else(c, t.w, e.w));
+}
+
+SI vec4 if_then_else(int32_t c, vec4 t, vec4 e) { return c ? t : e; }
+
+SI vec4_scalar if_then_else(int32_t c, vec4_scalar t, vec4_scalar e) {
+  return c ? t : e;
+}
+
+SI vec2 clamp(vec2 a, Float minVal, Float maxVal) {
+  return vec2(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal));
+}
+
+SI vec2 clamp(vec2 a, vec2 minVal, vec2 maxVal) {
+  return vec2(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y));
+}
+
+SI vec2_scalar clamp(vec2_scalar a, vec2_scalar minVal, vec2_scalar maxVal) {
+  return vec2_scalar{clamp(a.x, minVal.x, maxVal.x),
+                     clamp(a.y, minVal.y, maxVal.y)};
+}
+
+SI vec2_scalar clamp(vec2_scalar a, float minVal, float maxVal) {
+  return vec2_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)};
+}
+
+SI I32 clamp(I32 a, I32 minVal, I32 maxVal) {
+  a = if_then_else(a < minVal, minVal, a);
+  return if_then_else(a > maxVal, maxVal, a);
+}
+
+SI vec3 clamp(vec3 a, Float minVal, Float maxVal) {
+  return vec3(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal),
+              clamp(a.z, minVal, maxVal));
+}
+
+SI vec3 clamp(vec3 a, vec3 minVal, vec3 maxVal) {
+  return vec3(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y),
+              clamp(a.z, minVal.z, maxVal.z));
+}
+
+SI vec4 clamp(vec4 a, Float minVal, Float maxVal) {
+  return vec4(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal),
+              clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal));
+}
+
+SI vec4 clamp(vec4 a, vec4 minVal, vec4 maxVal) {
+  return vec4(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y),
+              clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w));
+}
+
+SI vec4_scalar clamp(vec4_scalar a, vec4_scalar minVal, vec4_scalar maxVal) {
+  return vec4_scalar{
+      clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y),
+      clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)};
+}
+
+SI vec4_scalar clamp(vec4_scalar a, float minVal, float maxVal) {
+  return vec4_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal),
+                     clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)};
+}
+
+vec4 step(vec4 edge, vec4 x) {
+  return vec4(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z),
+              step(edge.w, x.w));
+}
+
+vec4_scalar step(vec4_scalar edge, vec4_scalar x) {
+  return vec4_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z),
+                     step(edge.w, x.w));
+}
+
+template <typename T>
+auto lessThanEqual(T x, T y) -> decltype(x <= y) {
+  return x <= y;
+}
+
+template <typename T>
+auto lessThan(T x, T y) -> decltype(x < y) {
+  return x < y;
+}
+
+SI bvec3 lessThanEqual(vec3 x, vec3 y) {
+  return bvec3(lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y),
+               lessThanEqual(x.z, y.z));
+}
+
+SI bvec2 lessThanEqual(vec2 x, vec2 y) {
+  return bvec2(lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y));
+}
+
+SI bvec2_scalar lessThanEqual(vec2_scalar x, vec2_scalar y) {
+  return bvec2_scalar{lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y)};
+}
+
+SI bvec4 lessThanEqual(vec4 x, vec4 y) {
+  return bvec4(lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y),
+               lessThanEqual(x.z, y.z), lessThanEqual(x.w, y.w));
+}
+
+SI bvec4_scalar lessThanEqual(vec4_scalar x, vec4_scalar y) {
+  return bvec4_scalar{lessThanEqual(x.x, y.x), lessThanEqual(x.y, y.y),
+                      lessThanEqual(x.z, y.z), lessThanEqual(x.w, y.w)};
+}
+
+SI bvec2 lessThan(vec2 x, vec2 y) {
+  return bvec2(lessThan(x.x, y.x), lessThan(x.y, y.y));
+}
+
+SI bvec2_scalar lessThan(vec2_scalar x, vec2_scalar y) {
+  return bvec2_scalar(lessThan(x.x, y.x), lessThan(x.y, y.y));
+}
+
+SI bvec4 lessThan(vec4 x, vec4 y) {
+  return bvec4(lessThan(x.x, y.x), lessThan(x.y, y.y), lessThan(x.z, y.z),
+               lessThan(x.w, y.w));
+}
+
+SI bvec4_scalar lessThan(vec4_scalar x, vec4_scalar y) {
+  return bvec4_scalar{lessThan(x.x, y.x), lessThan(x.y, y.y),
+                      lessThan(x.z, y.z), lessThan(x.w, y.w)};
+}
+
+template <typename T>
+auto greaterThan(T x, T y) -> decltype(x > y) {
+  return x > y;
+}
+
+bvec2 greaterThan(vec2 x, vec2 y) {
+  return bvec2(greaterThan(x.x, y.x), greaterThan(x.y, y.y));
+}
+
+bvec2_scalar greaterThan(vec2_scalar x, vec2_scalar y) {
+  return bvec2_scalar(greaterThan(x.x, y.x), greaterThan(x.y, y.y));
+}
+
+SI bvec4 greaterThan(vec4 x, vec4 y) {
+  return bvec4(greaterThan(x.x, y.x), greaterThan(x.y, y.y),
+               greaterThan(x.z, y.z), greaterThan(x.w, y.w));
+}
+
+SI bvec4_scalar greaterThan(vec4_scalar x, vec4_scalar y) {
+  return bvec4_scalar{greaterThan(x.x, y.x), greaterThan(x.y, y.y),
+                      greaterThan(x.z, y.z), greaterThan(x.w, y.w)};
+}
+
+template <typename T>
+auto greaterThanEqual(T x, T y) -> decltype(x >= y) {
+  return x >= y;
+}
+
+bvec4 greaterThanEqual(vec4 x, vec4 y) {
+  return bvec4(greaterThanEqual(x.x, y.x), greaterThanEqual(x.y, y.y),
+               greaterThanEqual(x.z, y.z), greaterThanEqual(x.w, y.w));
+}
+
+template <typename T>
+auto equal(T x, T y) -> decltype(x > y) {
+  return x == y;
+}
+
+bvec2 equal(vec2 x, vec2 y) { return bvec2(equal(x.x, y.x), equal(x.y, y.y)); }
+
+bvec2_scalar equal(vec2_scalar x, vec2_scalar y) {
+  return bvec2_scalar(equal(x.x, y.x), equal(x.y, y.y));
+}
+
+template <typename T>
+auto notEqual(T x, T y) -> decltype(x > y) {
+  return x != y;
+}
+
+bvec2 notEqual(vec2 x, vec2 y) {
+  return bvec2(notEqual(x.x, y.x), notEqual(x.y, y.y));
+}
+
+bvec2_scalar notEqual(vec2_scalar x, vec2_scalar y) {
+  return bvec2_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y));
+}
+
+struct mat4_scalar;
+
+struct mat2_scalar {
+  vec2_scalar data[2];
+
+  mat2_scalar() = default;
+  IMPLICIT constexpr mat2_scalar(float a)
+      : data{vec2_scalar(a), vec2_scalar(a)} {}
+  constexpr mat2_scalar(vec2_scalar a, vec2_scalar b) : data{a, b} {}
+  IMPLICIT mat2_scalar(const mat4_scalar& mat);
+
+  vec2_scalar& operator[](int index) { return data[index]; }
+  const vec2_scalar& operator[](int index) const { return data[index]; }
+
+  friend vec2_scalar operator*(mat2_scalar m, vec2_scalar v) {
+    vec2_scalar u;
+    u.x = m[0].x * v.x + m[1].x * v.y;
+    u.y = m[0].y * v.x + m[1].y * v.y;
+    return u;
+  }
+
+  friend vec2 operator*(mat2_scalar m, vec2 v) {
+    vec2 u;
+    u.x = m[0].x * v.x + m[1].x * v.y;
+    u.y = m[0].y * v.x + m[1].y * v.y;
+    return u;
+  }
+
+  friend mat2_scalar operator*(mat2_scalar m, float f) {
+    mat2_scalar u = m;
+    u[0].x *= f;
+    u[0].y *= f;
+    u[1].x *= f;
+    u[1].y *= f;
+    return u;
+  }
+};
+
+struct mat4;
+
+struct mat2 {
+  vec2 data[2];
+
+  vec2& operator[](int index) { return data[index]; }
+  const vec2& operator[](int index) const { return data[index]; }
+  mat2() = default;
+
+  IMPLICIT constexpr mat2(Float a) : data{vec2(a), vec2(a)} {}
+
+  constexpr mat2(vec2 a, vec2 b) : data{a, b} {}
+  IMPLICIT mat2(const mat4& mat);
+  IMPLICIT constexpr mat2(mat2_scalar s)
+      : data{vec2(s.data[0]), vec2(s.data[1])} {}
+
+  friend vec2 operator*(mat2 m, vec2 v) {
+    vec2 u;
+    u.x = m[0].x * v.x + m[1].x * v.y;
+    u.y = m[0].y * v.x + m[1].y * v.y;
+    return u;
+  }
+  friend mat2 operator*(mat2 m, Float f) {
+    mat2 u = m;
+    u[0].x *= f;
+    u[0].y *= f;
+    u[1].x *= f;
+    u[1].y *= f;
+    return u;
+  }
+};
+
+mat2_scalar make_mat2(float n) { return mat2_scalar{{n, n}, {n, n}}; }
+
+mat2_scalar make_mat2(const mat2_scalar& m) { return m; }
+
+mat2_scalar make_mat2(const vec2_scalar& x, const vec2_scalar& y) {
+  return mat2_scalar{x, y};
+}
+
+template <typename N>
+mat2 make_mat2(const N& n) {
+  return mat2(n);
+}
+
+template <typename X, typename Y>
+mat2 make_mat2(const X& x, const Y& y) {
+  return mat2(x, y);
+}
+
+SI mat2 if_then_else(I32 c, mat2 t, mat2 e) {
+  return mat2(if_then_else(c, t[0], e[0]), if_then_else(c, t[0], e[1]));
+}
+
+SI mat2 if_then_else(int32_t c, mat2 t, mat2 e) { return c ? t : e; }
+
+struct mat3_scalar {
+  vec3_scalar data[3];
+
+  mat3_scalar() = default;
+  constexpr mat3_scalar(vec3_scalar a, vec3_scalar b, vec3_scalar c)
+      : data{a, b, c} {}
+  IMPLICIT mat3_scalar(const mat4_scalar& mat);
+
+  vec3_scalar& operator[](int index) { return data[index]; }
+  const vec3_scalar& operator[](int index) const { return data[index]; }
+
+  friend vec3_scalar operator*(mat3_scalar m, vec3_scalar v) {
+    vec3_scalar u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    return u;
+  }
+
+  friend vec3 operator*(mat3_scalar m, vec3 v) {
+    vec3 u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    return u;
+  }
+
+  friend auto operator*(mat3_scalar a, mat3_scalar b) {
+    mat3_scalar r;
+    for (int c = 0; c < 3; c++) {
+      const auto& v = b[c];
+      r[c].x = a[0].x * v.x + a[1].x * v.y + a[2].x * v.z;
+      r[c].y = a[0].y * v.x + a[1].y * v.y + a[2].y * v.z;
+      r[c].z = a[0].z * v.x + a[1].z * v.y + a[2].z * v.z;
+    }
+    return r;
+  }
+};
+
+struct mat3 {
+  vec3 data[3];
+
+  vec3& operator[](int index) { return data[index]; }
+  const vec3& operator[](int index) const { return data[index]; }
+  mat3() = default;
+  constexpr mat3(vec3 a, vec3 b, vec3 c) : data{a, b, c} {}
+
+  IMPLICIT constexpr mat3(mat3_scalar s)
+      : data{vec3(s.data[0]), vec3(s.data[1]), vec3(s.data[2])} {}
+
+  constexpr mat3(mat3_scalar s0, mat3_scalar s1, mat3_scalar s2, mat3_scalar s3)
+      : data{vec3(s0.data[0], s1.data[0], s2.data[0], s3.data[0]),
+             vec3(s0.data[1], s1.data[1], s2.data[1], s3.data[1]),
+             vec3(s0.data[2], s1.data[2], s2.data[2], s3.data[2])} {}
+
+  constexpr mat3(Float d1, Float d2, Float d3, Float d4, Float d5, Float d6,
+                 Float d7, Float d8, Float d9)
+      : data{vec3(d1, d2, d3), vec3(d4, d5, d6), vec3(d7, d8, d9)} {}
+
+  IMPLICIT mat3(const mat4& mat);
+
+  friend vec3 operator*(mat3 m, vec3 v) {
+    vec3 u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    return u;
+  }
+};
+
+mat3_scalar force_scalar(const mat3& v) {
+  return mat3_scalar{force_scalar(v[0]), force_scalar(v[1]),
+                     force_scalar(v[2])};
+}
+
+mat3_scalar make_mat3(const mat3_scalar& m) { return m; }
+
+mat3_scalar make_mat3(const vec3_scalar& x, const vec3_scalar& y,
+                      const vec3_scalar& z) {
+  return mat3_scalar{x, y, z};
+}
+
+constexpr mat3_scalar make_mat3(float m0, float m1, float m2, float m3,
+                                float m4, float m5, float m6, float m7,
+                                float m8) {
+  return mat3_scalar{{m0, m1, m2}, {m3, m4, m5}, {m6, m7, m8}};
+}
+
+template <typename N>
+mat3 make_mat3(const N& n) {
+  return mat3(n);
+}
+
+template <typename X, typename Y, typename Z>
+mat3 make_mat3(const X& x, const Y& y, const Z& z) {
+  return mat3(x, y, z);
+}
+
+struct mat3x4_scalar {
+  vec4_scalar data[3];
+
+  mat3x4_scalar() = default;
+  constexpr mat3x4_scalar(vec4_scalar a, vec4_scalar b, vec4_scalar c)
+      : data{a, b, c} {}
+
+  auto& operator[](int index) { return data[index]; }
+  constexpr auto operator[](int index) const { return data[index]; }
+
+  friend auto operator*(mat3x4_scalar m, vec3_scalar v) {
+    vec4_scalar u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z;
+    return u;
+  }
+
+  friend auto operator*(mat3x4_scalar m, vec3 v) {
+    vec4 u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z;
+    return u;
+  }
+};
+
+constexpr mat3x4_scalar make_mat3x4(float m0, float m1, float m2, float m3,
+                                    float m4, float m5, float m6, float m7,
+                                    float m8, float m9, float m10, float m11) {
+  return mat3x4_scalar{
+      {m0, m1, m2, m3},
+      {m4, m5, m6, m7},
+      {m8, m9, m10, m11},
+  };
+}
+
+struct mat4x3_scalar {
+  vec3_scalar data[4];
+
+  mat4x3_scalar() = default;
+  constexpr mat4x3_scalar(vec3_scalar a, vec3_scalar b, vec3_scalar c,
+                          vec3_scalar d)
+      : data{a, b, c, d} {}
+
+  auto& operator[](int index) { return data[index]; }
+  constexpr auto operator[](int index) const { return data[index]; }
+
+  friend auto operator*(mat4x3_scalar m, vec4_scalar v) {
+    vec3_scalar u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    return u;
+  }
+
+  friend auto operator*(mat4x3_scalar m, vec4 v) {
+    vec3 u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    return u;
+  }
+};
+
+constexpr mat4x3_scalar transpose(const mat3x4_scalar m) {
+  return {{m[0].x, m[1].x, m[2].x},
+          {m[0].y, m[1].y, m[2].y},
+          {m[0].z, m[1].z, m[2].z},
+          {m[0].w, m[1].w, m[2].w}};
+}
+
+struct mat4_scalar {
+  vec4_scalar data[4];
+
+  mat4_scalar() = default;
+  constexpr mat4_scalar(vec4_scalar a, vec4_scalar b, vec4_scalar c,
+                        vec4_scalar d)
+      : data{a, b, c, d} {}
+
+  vec4_scalar& operator[](int index) { return data[index]; }
+  const vec4_scalar& operator[](int index) const { return data[index]; }
+
+  static mat4_scalar load_from_ptr(const float* f) {
+    return mat4_scalar(
+        vec4_scalar::load_from_ptr(&f[0]), vec4_scalar::load_from_ptr(&f[4]),
+        vec4_scalar::load_from_ptr(&f[8]), vec4_scalar::load_from_ptr(&f[12]));
+  }
+
+  friend vec4_scalar operator*(mat4_scalar m, vec4_scalar v) {
+    vec4_scalar u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w;
+    return u;
+  }
+
+  friend vec4 operator*(mat4_scalar m, vec4 v) {
+    vec4 u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w;
+    return u;
+  }
+};
+
+struct mat4 {
+  vec4 data[4];
+
+  mat4() = default;
+  IMPLICIT constexpr mat4(mat4_scalar s)
+      : data{vec4(s.data[0]), vec4(s.data[1]), vec4(s.data[2]),
+             vec4(s.data[3])} {}
+
+  constexpr mat4(vec4 a, vec4 b, vec4 c, vec4 d) : data{a, b, c, d} {}
+
+  vec4& operator[](int index) { return data[index]; }
+  const vec4& operator[](int index) const { return data[index]; }
+
+  friend vec4 operator*(mat4 m, vec4 v) {
+    vec4 u;
+    u.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    u.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    u.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    u.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w;
+    return u;
+  }
+};
+
+mat3::mat3(const mat4& mat)
+    : mat3(vec3(mat[0].x, mat[0].y, mat[0].z),
+           vec3(mat[1].x, mat[1].y, mat[1].z),
+           vec3(mat[2].x, mat[2].y, mat[2].z)) {}
+
+IMPLICIT mat3_scalar::mat3_scalar(const mat4_scalar& mat)
+    : mat3_scalar(vec3_scalar(mat[0].x, mat[0].y, mat[0].z),
+                  vec3_scalar(mat[1].x, mat[1].y, mat[1].z),
+                  vec3_scalar(mat[2].x, mat[2].y, mat[2].z)) {}
+
+IMPLICIT mat2::mat2(const mat4& mat)
+    : mat2(vec2(mat[0].x, mat[0].y), vec2(mat[1].x, mat[1].y)) {}
+
+IMPLICIT mat2_scalar::mat2_scalar(const mat4_scalar& mat)
+    : mat2_scalar(vec2_scalar(mat[0].x, mat[0].y),
+                  vec2_scalar(mat[1].x, mat[1].y)) {}
+
+mat2_scalar make_mat2(const mat4_scalar& m) { return mat2_scalar(m); }
+
+mat3_scalar make_mat3(const mat4_scalar& m) { return mat3_scalar(m); }
+
+mat4_scalar force_scalar(const mat4& v) {
+  return mat4_scalar(force_scalar(v[0]), force_scalar(v[1]), force_scalar(v[2]),
+                     force_scalar(v[3]));
+}
+
+mat4_scalar make_mat4(const mat4_scalar& m) { return m; }
+
+mat4_scalar make_mat4(const vec4_scalar& x, const vec4_scalar& y,
+                      const vec4_scalar& z, const vec4_scalar& w) {
+  return mat4_scalar{x, y, z, w};
+}
+
+constexpr mat4_scalar make_mat4(float m0, float m1, float m2, float m3,
+                                float m4, float m5, float m6, float m7,
+                                float m8, float m9, float m10, float m11,
+                                float m12, float m13, float m14, float m15) {
+  return mat4_scalar{{m0, m1, m2, m3},
+                     {m4, m5, m6, m7},
+                     {m8, m9, m10, m11},
+                     {m12, m13, m14, m15}};
+}
+
+template <typename N>
+mat4 make_mat4(const N& n) {
+  return mat4(n);
+}
+
+template <typename X, typename Y, typename Z, typename W>
+mat4 make_mat4(const X& x, const Y& y, const Z& z, const W& w) {
+  return mat4(x, y, z, w);
+}
+
+SI mat3 if_then_else(I32 c, mat3 t, mat3 e) {
+  return mat3{if_then_else(c, t[0], e[0]), if_then_else(c, t[1], e[1]),
+              if_then_else(c, t[2], e[2])};
+}
+
+SI mat3 if_then_else(int32_t c, mat3 t, mat3 e) { return c ? t : e; }
+
+SI mat4 if_then_else(I32 c, mat4 t, mat4 e) {
+  return mat4{if_then_else(c, t[0], e[0]), if_then_else(c, t[1], e[1]),
+              if_then_else(c, t[2], e[2]), if_then_else(c, t[3], e[3])};
+}
+
+SI mat4 if_then_else(int32_t c, mat4 t, mat4 e) { return c ? t : e; }
+
+template <typename T, typename U, typename A,
+          typename R = typename T::vector_type>
+SI R mix(T x, U y, A a) {
+  return (y - x) * a + x;
+}
+
+SI Float mix(Float x, Float y, Float a) { return (y - x) * a + x; }
+
+template <typename T>
+SI T mix(T x, T y, float a) {
+  return (y - x) * a + x;
+}
+
+template <typename T>
+SI T mix(T x, T y, vec2_scalar a) {
+  return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y)};
+}
+
+template <typename T>
+SI T mix(T x, T y, vec3_scalar a) {
+  return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z)};
+}
+
+template <typename T>
+SI T mix(T x, T y, vec4_scalar a) {
+  return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z),
+           mix(x.w, y.w, a.w)};
+}
+
+ivec4 ivec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  return ivec4(select(c1), select(c2), select(c3), select(c4));
+}
+
+vec4 vec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  return vec4(select(c1), select(c2), select(c3), select(c4));
+}
+
+bool any(bool x) { return x; }
+
+Bool any(bvec4 x) { return x.x | x.y | x.z | x.w; }
+
+bool any(bvec4_scalar x) { return x.x | x.y | x.z | x.w; }
+
+Bool any(bvec2 x) { return x.x | x.y; }
+
+bool any(bvec2_scalar x) { return x.x | x.y; }
+
+bool all(bool x) { return x; }
+
+Bool all(bvec2 x) { return x.x & x.y; }
+
+bool all(bvec2_scalar x) { return x.x & x.y; }
+
+Bool all(bvec4 x) { return x.x & x.y & x.z & x.w; }
+
+bool all(bvec4_scalar x) { return x.x & x.y & x.z & x.w; }
+
+SI vec4 if_then_else(bvec4 c, vec4 t, vec4 e) {
+  return vec4(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y),
+              if_then_else(c.z, t.z, e.z), if_then_else(c.w, t.w, e.w));
+}
+SI vec3 if_then_else(bvec3 c, vec3 t, vec3 e) {
+  return vec3(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y),
+              if_then_else(c.z, t.z, e.z));
+}
+
+SI vec2 if_then_else(bvec2 c, vec2 t, vec2 e) {
+  return vec2(if_then_else(c.x, t.x, e.x), if_then_else(c.y, t.y, e.y));
+}
+
+template <typename T, typename R = typename T::vector_type>
+SI R mix(T x, T y, bvec4 a) {
+  return if_then_else(a, y, x);
+}
+
+template <typename T, typename R = typename T::vector_type>
+SI R mix(T x, T y, bvec3 a) {
+  return if_then_else(a, y, x);
+}
+
+template <typename T, typename R = typename T::vector_type>
+SI R mix(T x, T y, bvec2 a) {
+  return if_then_else(a, y, x);
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec4_scalar a) {
+  return T{a.x ? y.x : x.x, a.y ? y.y : x.y, a.z ? y.z : x.z, a.w ? y.w : x.w};
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec4_scalar1 a) {
+  return a.x ? y : x;
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec3_scalar a) {
+  return T{a.x ? y.x : x.x, a.y ? y.y : x.y, a.z ? y.z : x.z};
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec3_scalar1 a) {
+  return a.x ? y : x;
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec2_scalar a) {
+  return T{a.x ? y.x : x.x, a.y ? y.y : x.y};
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec2_scalar1 a) {
+  return a.x ? y : x;
+}
+
+float dot(vec3_scalar a, vec3_scalar b) {
+  return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+Float dot(vec3 a, vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
+
+float dot(vec2_scalar a, vec2_scalar b) { return a.x * b.x + a.y * b.y; }
+
+Float dot(vec2 a, vec2 b) { return a.x * b.x + a.y * b.y; }
+
+#define sin __glsl_sin
+
+float sin(float x) { return sinf(x); }
+
+Float sin(Float v) { return {sinf(v.x), sinf(v.y), sinf(v.z), sinf(v.w)}; }
+
+#define cos __glsl_cos
+
+float cos(float x) { return cosf(x); }
+
+Float cos(Float v) { return {cosf(v.x), cosf(v.y), cosf(v.z), cosf(v.w)}; }
+
+#define tan __glsl_tan
+
+float tan(float x) { return tanf(x); }
+
+Float tan(Float v) { return {tanf(v.x), tanf(v.y), tanf(v.z), tanf(v.w)}; }
+
+#define atan __glsl_atan
+
+float atan(float x) { return atanf(x); }
+
+Float atan(Float v) { return {atanf(v.x), atanf(v.y), atanf(v.z), atanf(v.w)}; }
+
+float atan(float a, float b) { return atan2f(a, b); }
+
+Float atan(Float a, Float b) {
+  return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z),
+          atan2f(a.w, b.w)};
+}
+
+bvec4 equal(vec4 x, vec4 y) {
+  return bvec4(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z),
+               equal(x.w, y.w));
+}
+
+bvec4_scalar equal(vec4_scalar x, vec4_scalar y) {
+  return bvec4_scalar(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z),
+                      equal(x.w, y.w));
+}
+
+bvec4 notEqual(vec4 x, vec4 y) {
+  return bvec4(notEqual(x.x, y.x), notEqual(x.y, y.y), notEqual(x.z, y.z),
+               notEqual(x.w, y.w));
+}
+
+bvec4_scalar notEqual(vec4_scalar x, vec4_scalar y) {
+  return bvec4_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y),
+                      notEqual(x.z, y.z), notEqual(x.w, y.w));
+}
+
+bvec4 notEqual(ivec4 a, ivec4 b) {
+  return bvec4(a.x != b.x, a.y != b.y, a.z != b.z, a.w != b.w);
+}
+
+bvec4_scalar notEqual(ivec4_scalar a, ivec4_scalar b) {
+  return bvec4_scalar{a.x != b.x, a.y != b.y, a.z != b.z, a.w != b.w};
+}
+
+mat3 transpose(mat3 m) {
+  return mat3(vec3(m[0].x, m[1].x, m[2].x), vec3(m[0].y, m[1].y, m[2].y),
+              vec3(m[0].z, m[1].z, m[2].z));
+}
+
+mat3_scalar transpose(mat3_scalar m) {
+  return mat3_scalar{vec3_scalar(m[0].x, m[1].x, m[2].x),
+                     vec3_scalar(m[0].y, m[1].y, m[2].y),
+                     vec3_scalar(m[0].z, m[1].z, m[2].z)};
+}
+
+vec2 abs(vec2 v) { return vec2(abs(v.x), abs(v.y)); }
+
+vec2_scalar abs(vec2_scalar v) { return vec2_scalar{fabsf(v.x), fabsf(v.y)}; }
+
+vec2 sign(vec2 v) { return vec2(sign(v.x), sign(v.y)); }
+
+vec2_scalar sign(vec2_scalar v) { return vec2_scalar{sign(v.x), sign(v.y)}; }
+
+Float mod(Float a, Float b) { return a - b * floor(a / b); }
+
+vec2 mod(vec2 a, vec2 b) { return vec2(mod(a.x, b.x), mod(a.y, b.y)); }
+
+vec3 abs(vec3 v) { return vec3(abs(v.x), abs(v.y), abs(v.z)); }
+
+vec3 sign(vec3 v) { return vec3(sign(v.x), sign(v.y), sign(v.z)); }
+
+mat2 inverse(mat2 v) {
+  Float det = v[0].x * v[1].y - v[0].y * v[1].x;
+  return mat2(vec2(v[1].y, -v[0].y), vec2(-v[1].x, v[0].x)) * (1. / det);
+}
+
+mat2_scalar inverse(mat2_scalar v) {
+  float det = v[0].x * v[1].y - v[0].y * v[1].x;
+  return mat2_scalar{{v[1].y, -v[0].y}, {-v[1].x, v[0].x}} * (1. / det);
+}
+
+int32_t get_nth(I32 a, int n) { return a[n]; }
+
+float get_nth(Float a, int n) { return a[n]; }
+
+float get_nth(float a, int) { return a; }
+
+ivec2_scalar get_nth(ivec2 a, int n) { return ivec2_scalar{a.x[n], a.y[n]}; }
+
+vec2_scalar get_nth(vec2 a, int n) { return vec2_scalar{a.x[n], a.y[n]}; }
+
+vec3_scalar get_nth(vec3 a, int n) {
+  return vec3_scalar{a.x[n], a.y[n], a.z[n]};
+}
+
+vec4_scalar get_nth(vec4 a, int n) {
+  return vec4_scalar{a.x[n], a.y[n], a.z[n], a.w[n]};
+}
+
+ivec4_scalar get_nth(ivec4 a, int n) {
+  return ivec4_scalar{a.x[n], a.y[n], a.z[n], a.w[n]};
+}
+
+mat3_scalar get_nth(mat3 a, int n) {
+  return make_mat3(get_nth(a[0], n), get_nth(a[1], n), get_nth(a[2], n));
+}
+
+void put_nth(Float& dst, int n, float src) { dst[n] = src; }
+
+void put_nth(I32& dst, int n, int32_t src) { dst[n] = src; }
+
+void put_nth(ivec2& dst, int n, ivec2_scalar src) {
+  dst.x[n] = src.x;
+  dst.y[n] = src.y;
+}
+
+void put_nth(vec2& dst, int n, vec2_scalar src) {
+  dst.x[n] = src.x;
+  dst.y[n] = src.y;
+}
+
+void put_nth(vec3& dst, int n, vec3_scalar src) {
+  dst.x[n] = src.x;
+  dst.y[n] = src.y;
+  dst.z[n] = src.z;
+}
+
+void put_nth(ivec4& dst, int n, ivec4_scalar src) {
+  dst.x[n] = src.x;
+  dst.y[n] = src.y;
+  dst.z[n] = src.z;
+  dst.w[n] = src.w;
+}
+
+void put_nth(vec4& dst, int n, vec4_scalar src) {
+  dst.x[n] = src.x;
+  dst.y[n] = src.y;
+  dst.z[n] = src.z;
+  dst.w[n] = src.w;
+}
+
+// Use an ElementType type constructor
+// so that we can implement element_type for
+// Int and Float
+template <typename V>
+struct ElementType {
+  typedef typename V::element_type ty;
+};
+
+template <>
+struct ElementType<float> {
+  typedef float ty;
+};
+
+template <>
+struct ElementType<int> {
+  typedef float ty;
+};
+
+template <>
+struct ElementType<Float> {
+  typedef float ty;
+};
+
+template <>
+struct ElementType<I32> {
+  typedef int32_t ty;
+};
+
+void put_nth_component(ivec2_scalar& dst, int n, int32_t src) {
+  switch (n) {
+    case 0:
+      dst.x = src;
+      break;
+    case 1:
+      dst.y = src;
+      break;
+  }
+}
+
+void put_nth_component(ivec4_scalar& dst, int n, int32_t src) {
+  switch (n) {
+    case 0:
+      dst.x = src;
+      break;
+    case 1:
+      dst.y = src;
+      break;
+    case 2:
+      dst.z = src;
+      break;
+    case 3:
+      dst.w = src;
+      break;
+  }
+}
+
+void put_nth_component(int& dst, int n, int src) {
+  switch (n) {
+    case 0:
+      dst = src;
+      break;
+  }
+}
+
+void put_nth_component(float& dst, int n, float src) {
+  switch (n) {
+    case 0:
+      dst = src;
+      break;
+  }
+}
+
+void put_nth_component(vec2_scalar& dst, int n, float src) {
+  switch (n) {
+    case 0:
+      dst.x = src;
+      break;
+    case 1:
+      dst.y = src;
+      break;
+  }
+}
+
+void put_nth_component(vec3_scalar& dst, int n, float src) {
+  switch (n) {
+    case 0:
+      dst.x = src;
+      break;
+    case 1:
+      dst.y = src;
+      break;
+    case 2:
+      dst.z = src;
+      break;
+  }
+}
+
+void put_nth_component(vec4_scalar& dst, int n, float src) {
+  switch (n) {
+    case 0:
+      dst.x = src;
+      break;
+    case 1:
+      dst.y = src;
+      break;
+    case 2:
+      dst.z = src;
+      break;
+    case 3:
+      dst.w = src;
+      break;
+  }
+}
+
+Float init_interp(float init0, float step) {
+  float init1 = init0 + step;
+  float init2 = init1 + step;
+  float init3 = init2 + step;
+  return {init0, init1, init2, init3};
+}
+
+vec2 init_interp(vec2_scalar init, vec2_scalar step) {
+  return vec2(init_interp(init.x, step.x), init_interp(init.y, step.y));
+}
+
+vec3 init_interp(vec3_scalar init, vec3_scalar step) {
+  return vec3(init_interp(init.x, step.x), init_interp(init.y, step.y),
+              init_interp(init.z, step.z));
+}
+
+vec4 init_interp(vec4_scalar init, vec4_scalar step) {
+  return vec4(init_interp(init.x, step.x), init_interp(init.y, step.y),
+              init_interp(init.z, step.z), init_interp(init.w, step.w));
+}
+
+template <typename T, size_t N>
+struct Array {
+  T elements[N];
+  T& operator[](size_t i) { return elements[i]; }
+  const T& operator[](size_t i) const { return elements[i]; }
+  template <typename S>
+  void convert(const Array<S, N>& s) {
+    for (size_t i = 0; i < N; ++i) elements[i] = T(s[i]);
+  }
+};
+
+template <size_t SIZE>
+Array<vec2, SIZE> if_then_else(I32 c, Array<vec2, SIZE> t,
+                               Array<vec2, SIZE> e) {
+  Array<vec2, SIZE> r;
+  for (size_t i = 0; i < SIZE; i++) {
+    r[i] = if_then_else(c, t[i], e[i]);
+  }
+  return r;
+}
+
+}  // namespace glsl