Diffstat (limited to 'gfx/wr/swgl/src/composite.h')
-rw-r--r-- | gfx/wr/swgl/src/composite.h | 922
1 files changed, 922 insertions, 0 deletions
diff --git a/gfx/wr/swgl/src/composite.h b/gfx/wr/swgl/src/composite.h
new file mode 100644
index 0000000000..a5a4489e6d
--- /dev/null
+++ b/gfx/wr/swgl/src/composite.h
@@ -0,0 +1,922 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

template <typename P>
static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
                             int span, int frac) {
  for (P* end = dst + span; dst < end; dst++) {
    *dst = *src;
    // Step source according to width ratio.
    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
      src++;
    }
  }
}

static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq,
                                 int srcZ, Texture& dsttex,
                                 const IntRect& dstReq, int dstZ, bool invertY,
                                 const IntRect& clipRect) {
  // Cache scaling ratios
  int srcWidth = srcReq.width();
  int srcHeight = srcReq.height();
  int dstWidth = dstReq.width();
  int dstHeight = dstReq.height();
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq);
  // Compute valid source bounds
  // Scale source to dest, rounding inward to avoid sampling outside source
  IntRect srcBounds = srctex.sample_bounds(srcReq, invertY).scale(
      srcWidth, srcHeight, dstWidth, dstHeight, true);
  // Limit dest sampling bounds to overlap source bounds
  dstBounds.intersect(srcBounds);
  // Compute the clipped bounds, relative to dstBounds.
  IntRect clippedDest = dstBounds.intersection(clipRect) - dstBounds.origin();
  // Check if clipped sampling bounds are empty
  if (clippedDest.is_empty()) {
    return;
  }
  // Compute final source bounds from clamped dest sampling bounds
  srcBounds =
      IntRect(dstBounds).scale(dstWidth, dstHeight, srcWidth, srcHeight);
  // Calculate source and dest pointers from clamped offsets
  int bpp = srctex.bpp();
  int srcStride = srctex.stride();
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ);
  char* src = srctex.sample_ptr(srcReq, srcBounds, srcZ, invertY);
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcStride = -srcStride;
  }
  int span = clippedDest.width();
  int fracX = srcWidth * clippedDest.x0;
  int fracY = srcHeight * clippedDest.y0;
  dest += destStride * clippedDest.y0;
  dest += bpp * clippedDest.x0;
  src += srcStride * (fracY / dstHeight);
  src += bpp * (fracX / dstWidth);
  fracY %= dstHeight;
  fracX %= dstWidth;
  for (int rows = clippedDest.height(); rows > 0; rows--) {
    if (srcWidth == dstWidth) {
      // No scaling, so just do a fast copy.
      memcpy(dest, src, span * bpp);
    } else {
      // Do scaling with different source and dest widths.
      switch (bpp) {
        case 1:
          scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, srcWidth, span,
                    fracX);
          break;
        case 2:
          scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, srcWidth, span,
                    fracX);
          break;
        case 4:
          scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, srcWidth, span,
                    fracX);
          break;
        default:
          assert(false);
          break;
      }
    }
    dest += destStride;
    // Step source according to height ratio.
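    // Worked example (illustration, not part of the original patch): with
    // srcHeight = 3 and dstHeight = 5, starting from fracY = 0 the accumulator
    // runs 3, 6->1, 4, 7->2, 5->0 across the five dest rows, so the source row
    // advances after dest rows 1, 3 and 4 and the dest rows sample source rows
    // 0, 0, 1, 1, 2. scale_row applies the same accumulation horizontally.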
    for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) {
      src += srcStride;
    }
  }
}

static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV,
                            float srcDU, int srcZOffset,
                            sampler2DArray sampler) {
  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
  for (; span >= 4; span -= 4) {
    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
    unaligned_store(dest, srcpx);
    dest += 4;
    uv.x += 4 * srcDU;
  }
  if (span > 0) {
    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
    partial_store_span(dest, srcpx, span);
  }
}

static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV,
                            float srcDU, int srcZOffset,
                            sampler2DArray sampler) {
  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
  for (; span >= 4; span -= 4) {
    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
    unaligned_store(dest, srcpx);
    dest += 4;
    uv.x += 4 * srcDU;
  }
  if (span > 0) {
    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
    partial_store_span(dest, srcpx, span);
  }
}

static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV,
                            float srcDU, int srcZOffset,
                            sampler2DArray sampler) {
  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
  for (; span >= 4; span -= 4) {
    auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv), srcZOffset);
    unaligned_store(dest, srcpx);
    dest += 4;
    uv.x += 4 * srcDU;
  }
  if (span > 0) {
    auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv), srcZOffset);
    partial_store_span(dest, srcpx, span);
  }
}

static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq,
                                  int srcZ, Texture& dsttex,
                                  const IntRect& dstReq, int dstZ, bool invertY,
                                  const IntRect& clipRect) {
  assert(srctex.internal_format == GL_RGBA8 ||
         srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8);
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq);
  dstBounds.intersect(clipRect);
  // Check if sampling bounds are empty
  if (dstBounds.is_empty()) {
    return;
  }
  // Initialize sampler for source texture
  sampler2DArray_impl sampler;
  init_sampler(&sampler, srctex);
  init_depth(&sampler, srctex);
  sampler.filter = TextureFilter::LINEAR;
  // Compute source UVs
  int srcZOffset = srcZ * sampler.height_stride;
  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
                     float(srcReq.height()) / dstReq.height());
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcUV.y += srcReq.height();
    srcDUV.y = -srcDUV.y;
  }
  // Skip to clamped source start
  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
  // Scale UVs by lerp precision
  srcUV = linearQuantize(srcUV, 128);
  srcDUV *= 128.0f;
  // Calculate dest pointer from clamped offsets
  int bpp = dsttex.bpp();
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ);
  int span = dstBounds.width();
  for (int rows = dstBounds.height(); rows > 0; rows--) {
    switch (bpp) {
      case 1:
        linear_row_blit((uint8_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
                        &sampler);
        break;
      case 2:
        linear_row_blit((uint16_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
                        &sampler);
        break;
      case 4:
        linear_row_blit((uint32_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
                        &sampler);
        break;
      default:
        assert(false);
        break;
    }
    dest += destStride;
    srcUV.y += srcDUV.y;
  }
}

static void linear_row_composite(uint32_t* dest, int span,
                                 const vec2_scalar& srcUV, float srcDU,
                                 sampler2D sampler) {
  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
  for (; span >= 4; span -= 4) {
    WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv), 0);
    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    unaligned_store(dest, r);

    dest += 4;
    uv.x += 4 * srcDU;
  }
  if (span > 0) {
    WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv), 0);
    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span));
    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    partial_store_span(dest, r, span);
  }
}

static NO_INLINE void linear_composite(Texture& srctex, const IntRect& srcReq,
                                       Texture& dsttex, const IntRect& dstReq,
                                       bool invertY, const IntRect& clipRect) {
  assert(srctex.bpp() == 4);
  assert(dsttex.bpp() == 4);
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq);
  dstBounds.intersect(clipRect);
  // Check if sampling bounds are empty
  if (dstBounds.is_empty()) {
    return;
  }
  // Initialize sampler for source texture
  sampler2D_impl sampler;
  init_sampler(&sampler, srctex);
  sampler.filter = TextureFilter::LINEAR;
  // Compute source UVs
  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
                     float(srcReq.height()) / dstReq.height());
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcUV.y += srcReq.height();
    srcDUV.y = -srcDUV.y;
  }
  // Skip to clamped source start
  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
  // Scale UVs by lerp precision
  srcUV = linearQuantize(srcUV, 128);
  srcDUV *= 128.0f;
  // Calculate dest pointer from clamped offsets
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds, 0);
  int span = dstBounds.width();
  for (int rows = dstBounds.height(); rows > 0; rows--) {
    linear_row_composite((uint32_t*)dest, span, srcUV, srcDUV.x, &sampler);
    dest += destStride;
    srcUV.y += srcDUV.y;
  }
}

extern "C" {

void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                     GLbitfield mask, GLenum filter) {
  assert(mask == GL_COLOR_BUFFER_BIT);
  Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
  if (!srcfb || srcfb->layer < 0) return;
  Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
  if (!dstfb || dstfb->layer < 0) return;
  Texture& srctex = ctx->textures[srcfb->color_attachment];
  if (!srctex.buf || srcfb->layer >= max(srctex.depth, 1)) return;
  Texture& dsttex = ctx->textures[dstfb->color_attachment];
  if (!dsttex.buf || dstfb->layer >= max(dsttex.depth, 1)) return;
  assert(!dsttex.locked);
  if (srctex.internal_format != dsttex.internal_format) {
    assert(false);
    return;
  }
  // Force flipped Y onto dest coordinates
  if (srcY1 < srcY0) {
    swap(srcY0, srcY1);
    swap(dstY0, dstY1);
  }
  bool invertY = dstY1 < dstY0;
  if (invertY) {
    swap(dstY0, dstY1);
  }
  IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset;
  IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset;
  if (srcReq.is_empty() || dstReq.is_empty()) {
    return;
  }
  IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()};
  prepare_texture(srctex);
  prepare_texture(dsttex, &dstReq);
  if (!srcReq.same_size(dstReq) && srctex.width >= 2 &&
      filter == GL_LINEAR &&
      (srctex.internal_format == GL_RGBA8 || srctex.internal_format == GL_R8 ||
       srctex.internal_format == GL_RG8)) {
    linear_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
                invertY, dstReq);
  } else {
    scale_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
               invertY, clipRect);
  }
}

typedef Texture LockedTexture;

// Lock the given texture to prevent modification.
LockedTexture* LockTexture(GLuint texId) {
  Texture& tex = ctx->textures[texId];
  if (!tex.buf) {
    assert(tex.buf != nullptr);
    return nullptr;
  }
  if (__sync_fetch_and_add(&tex.locked, 1) == 0) {
    // If this is the first time locking the texture, flush any delayed clears.
    prepare_texture(tex);
  }
  return (LockedTexture*)&tex;
}

// Lock the given framebuffer's color attachment to prevent modification.
LockedTexture* LockFramebuffer(GLuint fboId) {
  Framebuffer& fb = ctx->framebuffers[fboId];
  // Only allow locking a framebuffer if it has a valid color attachment and
  // only if targeting the first layer.
  if (!fb.color_attachment || fb.layer > 0) {
    assert(fb.color_attachment != 0);
    assert(fb.layer == 0);
    return nullptr;
  }
  return LockTexture(fb.color_attachment);
}

// Reference an already locked resource
void LockResource(LockedTexture* resource) {
  if (!resource) {
    return;
  }
  __sync_fetch_and_add(&resource->locked, 1);
}

// Remove a lock on a texture that has been previously locked
void UnlockResource(LockedTexture* resource) {
  if (!resource) {
    return;
  }
  if (__sync_fetch_and_add(&resource->locked, -1) <= 0) {
    // The lock should always be non-zero before unlocking.
    assert(0);
  }
}

// Get the underlying buffer for a locked resource
void* GetResourceBuffer(LockedTexture* resource, int32_t* width,
                        int32_t* height, int32_t* stride) {
  *width = resource->width;
  *height = resource->height;
  *stride = resource->stride();
  return resource->buf;
}

static void unscaled_row_composite(uint32_t* dest, const uint32_t* src,
                                   int span) {
  const uint32_t* end = src + span;
  while (src + 4 <= end) {
    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    unaligned_store(dest, r);
    src += 4;
    dest += 4;
  }
  if (src < end) {
    WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, end - src));
    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, end - src));
    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    partial_store_span(dest, r, end - src);
  }
}

static NO_INLINE void unscaled_composite(Texture& srctex, const IntRect& srcReq,
                                         Texture& dsttex, const IntRect& dstReq,
                                         bool invertY,
                                         const IntRect& clipRect) {
  IntRect bounds = dsttex.sample_bounds(dstReq);
  bounds.intersect(clipRect);
  bounds.intersect(srctex.sample_bounds(srcReq, invertY));
  char* dest = dsttex.sample_ptr(dstReq, bounds, 0);
  char* src = srctex.sample_ptr(srcReq, bounds, 0, invertY);
  int srcStride = srctex.stride();
  int destStride = dsttex.stride();
  if (invertY) {
    srcStride = -srcStride;
  }
  for (int rows = bounds.height(); rows > 0; rows--) {
    unscaled_row_composite((uint32_t*)dest, (const uint32_t*)src,
                           bounds.width());
    dest += destStride;
    src += srcStride;
  }
}

// Extension for optimized compositing of textures or framebuffers that may be
// safely used across threads. The source and destination must be locked to
// ensure that they can be safely accessed while the SWGL context might be used
// by another thread. Band extents along the Y axis may be used to clip the
// destination rectangle without affecting the integer scaling ratios.
void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX,
               GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
               GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
               GLboolean opaque, GLboolean flip, GLenum filter, GLint clipX,
               GLint clipY, GLsizei clipWidth, GLsizei clipHeight) {
  if (!lockedDst || !lockedSrc) {
    return;
  }
  Texture& srctex = *lockedSrc;
  Texture& dsttex = *lockedDst;
  assert(srctex.bpp() == 4);
  assert(dsttex.bpp() == 4);

  IntRect srcReq =
      IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset;
  IntRect dstReq =
      IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
  // Compute clip rect as relative to the dstReq, as that's the same coords
  // as used for the sampling bounds.
  IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
                      clipY - dstY + clipHeight};

  if (opaque) {
    // Ensure we have rows of at least 2 pixels when using the linear filter
    // to avoid overreading the row.
    if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) {
      linear_blit(srctex, srcReq, 0, dsttex, dstReq, 0, flip, clipRect);
    } else {
      scale_blit(srctex, srcReq, 0, dsttex, dstReq, 0, flip, clipRect);
    }
  } else {
    if (!srcReq.same_size(dstReq) && srctex.width >= 2) {
      linear_composite(srctex, srcReq, dsttex, dstReq, flip, clipRect);
    } else {
      unscaled_composite(srctex, srcReq, dsttex, dstReq, flip, clipRect);
    }
  }
}

}  // extern "C"

// Saturated add helper for YUV conversion. Supported platforms have intrinsics
// to do this natively, but support a slower generic fallback just in case.
static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) {
#if USE_SSE2
  return _mm_adds_epi16(x, y);
#elif USE_NEON
  return vqaddq_s16(x, y);
#else
  auto r = x + y;
  // An overflow occurred if the signs of both inputs x and y did not differ
  // but yet the sign of the result did differ.
  auto overflow = (~(x ^ y) & (r ^ x)) >> 15;
  // If there was an overflow, we need to choose the appropriate limit to clamp
  // to depending on whether or not the inputs are negative.
  auto limit = (x >> 15) ^ 0x7FFF;
  // If we didn't overflow, just use the result, and otherwise, use the limit.
  return (~overflow & r) | (overflow & limit);
#endif
}

// Interleave and packing helper for YUV conversion. During transform by the
// color matrix, the color components are de-interleaved as this format is
// usually what comes out of the planar YUV textures. The components thus need
// to be interleaved before finally getting packed to BGRA format. Alpha is
// forced to be opaque.
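// Illustration (not part of the original patch): for one pixel, br holds
// {B, R} and gg holds {G, x}, where x is an unused high lane, so zip(br, gg)
// yields {B, G, R, x}; pack() then narrows each lane to 8 bits and the OR
// below forces the x byte to an opaque alpha of 255.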
static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) {
  return pack(bit_cast<WideRGBA8>(zip(br, gg))) |
         PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
}

enum YUVColorSpace { REC_601 = 0, REC_709, REC_2020, IDENTITY };

// clang-format off
// Supports YUV color matrices of the form:
// [R]   [1.1643835616438356,  0.0,  rv ]   [Y -  16]
// [G] = [1.1643835616438358, -gu,  -gv ] x [U - 128]
// [B]   [1.1643835616438356,  bu,  0.0 ]   [V - 128]
// We must be able to multiply a YUV input by a matrix coefficient ranging as
// high as ~2.2 in the U/V cases, where U/V can be signed values between -128
// and 127. The largest fixed-point representation we can thus support without
// overflowing 16 bit integers leaves us 6 bits of fractional precision while
// also supporting a sign bit. The closest representation of the Y coefficient
// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces
// we support. Conversions can still sometimes overflow the precision and
// require clamping back into range, so we use saturated additions to do this
// efficiently at no extra cost.
// clang-format on
template <const double MATRIX[4]>
struct YUVConverterImpl {
  static inline PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) {
    // Convert matrix coefficients to fixed-point representation.
    constexpr int16_t mrv = int16_t(MATRIX[0] * 64.0 + 0.5);
    constexpr int16_t mgu = -int16_t(MATRIX[1] * -64.0 + 0.5);
    constexpr int16_t mgv = -int16_t(MATRIX[2] * -64.0 + 0.5);
    constexpr int16_t mbu = int16_t(MATRIX[3] * 64.0 + 0.5);

    // Bias Y values by -16 and multiply by 74.5. Add 2^5 offset to round to
    // nearest 2^6.
    yy = yy * 74 + (yy >> 1) + (int16_t(-16 * 74.5) + (1 << 5));

    // Bias U/V values by -128.
    uv -= 128;

    // Compute (R, B) = (74.5*Y + rv*V, 74.5*Y + bu*U)
    auto br = V8<int16_t>{mbu, mrv, mbu, mrv, mbu, mrv, mbu, mrv} * uv;
    br = addsat(yy, br);
    br >>= 6;

    // Compute G = 74.5*Y + -gu*U + -gv*V
    auto gg = V8<int16_t>{mgu, mgv, mgu, mgv, mgu, mgv, mgu, mgv} * uv;
    gg = addsat(
        yy,
        addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16)));
    gg >>= 6;

    // Interleave B/R and G values. Force alpha to opaque.
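    // Worked example (illustration, not part of the original patch): with the
    // Rec601 coefficients and an input of Y = 128, U = 64, V = 192, the
    // floating-point matrix gives approximately R = 233, G = 103, B = 1; the
    // fixed-point path above lands within a couple of code values of that.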
    return packYUV(gg, br);
  }
};

template <YUVColorSpace COLOR_SPACE>
struct YUVConverter {};

// clang-format off
// From Rec601:
// [R]   [1.1643835616438356,  0.0,                  1.5960267857142858  ]   [Y -  16]
// [G] = [1.1643835616438358, -0.3917622900949137,  -0.8129676472377708  ] x [U - 128]
// [B]   [1.1643835616438356,  2.017232142857143,    8.862867620416422e-17]   [V - 128]
// clang-format on
constexpr double YUVMatrix601[4] = {1.5960267857142858, -0.3917622900949137,
                                    -0.8129676472377708, 2.017232142857143};
template <>
struct YUVConverter<REC_601> : YUVConverterImpl<YUVMatrix601> {};

// clang-format off
// From Rec709:
// [R]   [1.1643835616438356,  0.0,                  1.7927410714285714]   [Y -  16]
// [G] = [1.1643835616438358, -0.21324861427372963, -0.532909328559444 ] x [U - 128]
// [B]   [1.1643835616438356,  2.1124017857142854,   0.0               ]   [V - 128]
// clang-format on
static constexpr double YUVMatrix709[4] = {
    1.7927410714285714, -0.21324861427372963, -0.532909328559444,
    2.1124017857142854};
template <>
struct YUVConverter<REC_709> : YUVConverterImpl<YUVMatrix709> {};

// clang-format off
// From Rec2020:
// [R]   [1.16438356164384,  0.0,                1.678674107142860 ]   [Y -  16]
// [G] = [1.16438356164384, -0.187326104219343, -0.650424318505057 ] x [U - 128]
// [B]   [1.16438356164384,  2.14177232142857,   0.0               ]   [V - 128]
// clang-format on
static constexpr double YUVMatrix2020[4] = {
    1.678674107142860, -0.187326104219343, -0.650424318505057,
    2.14177232142857};
template <>
struct YUVConverter<REC_2020> : YUVConverterImpl<YUVMatrix2020> {};

// clang-format off
// [R]   [V]
// [G] = [Y]
// [B]   [U]
// clang-format on
template <>
struct YUVConverter<IDENTITY> {
  static inline PackedRGBA8 convert(V8<int16_t> y, V8<int16_t> uv) {
    // Map U/V directly to B/R and map Y directly to G with opaque alpha.
    return packYUV(y, uv);
  }
};

// Helper function for textureLinearRowR8 that samples horizontal taps and
// combines them based on Y fraction with next row.
template <typename S>
static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix,
                                                 int32_t offsety,
                                                 int32_t stridey,
                                                 int16_t fracy) {
  uint8_t* buf = (uint8_t*)sampler->buf + offsety;
  auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
  auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
  auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>);
  buf += stridey;
  auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
  auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
  auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>);
  abcd0 += ((abcd1 - abcd0) * fracy) >> 7;
  return abcd0;
}

// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes
// constant Y and returns a duplicate of the result interleaved with itself
// to aid in later YUV transformation.
template <typename S>
static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix, int32_t offsety,
                                             int32_t stridey, int16_t fracy) {
  assert(sampler->format == TextureFormat::R8);

  // Calculate X fraction and clamp X offset into range.
  I32 fracx = ix;
  ix >>= 7;
  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
  ix = clampCoord(ix, sampler->width - 1);

  // Load the sample taps and combine rows.
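  // Note (illustration, not part of the original patch): each tap load grabs a
  // pixel and its right-hand neighbour, and linearRowTapsR8 blends the two
  // source rows as row0 + ((row1 - row0) * fracy) >> 7, i.e. a vertical lerp
  // in 7-bit fixed point; the horizontal lerp by fracx follows below.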
  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);

  // Unzip the result and do final horizontal multiply-add based on X fraction.
  auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6);
  auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7);
  abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7;

  // The final result is the packed values interleaved with a duplicate of
  // themselves.
  return abcdl;
}

// Optimized version of textureLinearPackedR8 for paired U/V R8 textures.
// Since the two textures have the same dimensions and stride, the addressing
// math can be shared between both samplers. This also allows a coalesced
// multiply in the final stage by packing both U/V results into a single
// operation.
template <typename S>
static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2,
                                                   I32 ix, int32_t offsety,
                                                   int32_t stridey,
                                                   int16_t fracy) {
  assert(sampler->format == TextureFormat::R8 &&
         sampler2->format == TextureFormat::R8);
  assert(sampler->width == sampler2->width &&
         sampler->height == sampler2->height);
  assert(sampler->stride == sampler2->stride);

  // Calculate X fraction and clamp X offset into range.
  I32 fracx = ix;
  ix >>= 7;
  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
  ix = clampCoord(ix, sampler->width - 1);

  // Load the sample taps for the first sampler and combine rows.
  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);

  // Load the sample taps for the second sampler and combine rows.
  auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy);

  // We are left with a result vector for each sampler with values for adjacent
  // pixels interleaved together in each. We need to unzip these values so that
  // we can do the final horizontal multiply-add based on the X fraction.
  auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14);
  auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15);
  abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7;

  // The final result is the packed values for the first sampler interleaved
  // with the packed values for the second sampler.
  return abcdxyzwl;
}

template <YUVColorSpace COLOR_SPACE>
static void linear_row_yuv(uint32_t* dest, int span, const vec2_scalar& srcUV,
                           float srcDU, const vec2_scalar& chromaUV,
                           float chromaDU, sampler2D_impl sampler[3],
                           int colorDepth) {
  // Casting to int loses some precision while stepping that can offset the
  // image, so shift the values by some extra bits of precision to minimize
  // this. We support up to 16 bits of image size, 7 bits of quantization,
  // and 1 bit for sign, which leaves 8 bits left for extra precision.
  const int STEP_BITS = 8;

  // Calculate varying and constant interp data for Y plane.
  I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS));
  int32_t yV = int32_t(srcUV.y);

  // Calculate varying and constant interp data for chroma planes.
  I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS));
  int32_t cV = int32_t(chromaUV.y);

  // We need to skip 4 pixels per chunk.
  int32_t yDU = int32_t((4 << STEP_BITS) * srcDU);
  int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU);

  if (sampler[0].width < 2 || sampler[1].width < 2) {
    // If the source row has less than 2 pixels, it's not safe to use a linear
    // filter because it may overread the row. Just convert the single pixel
    // with nearest filtering and fill the row with it.
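    // Note (illustration, not part of the original patch): a linear tap reads
    // a texel and its right-hand neighbour, so a 1-pixel-wide plane would be
    // overread; instead one texel per plane is fetched once and the converted
    // color is replicated across the whole span below.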
    I16 yuv =
        CONVERT(round_pixel((Float){
                    texelFetch(&sampler[0], ivec2(srcUV), 0).x.x,
                    texelFetch(&sampler[1], ivec2(chromaUV), 0).x.x,
                    texelFetch(&sampler[2], ivec2(chromaUV), 0).x.x, 1.0f}),
                I16);
    auto rgb = YUVConverter<COLOR_SPACE>::convert(zip(I16(yuv.x), I16(yuv.x)),
                                                  zip(I16(yuv.y), I16(yuv.z)));
    for (; span >= 4; span -= 4) {
      unaligned_store(dest, rgb);
      dest += 4;
    }
    if (span > 0) {
      partial_store_span(dest, rgb, span);
    }
  } else if (sampler[0].format == TextureFormat::R16) {
    // Sample each YUV plane, rescale it to fit in low 8 bits of word, and then
    // transform them by the appropriate color space.
    assert(colorDepth > 8);
    // Need to right shift the sample by the amount of bits over 8 it occupies.
    // On output from textureLinearUnpackedR16, we have lost 1 bit of precision
    // at the low end already, hence 1 is subtracted from the color depth.
    int rescaleBits = (colorDepth - 1) - 8;
    for (; span >= 4; span -= 4) {
      auto yPx =
          textureLinearUnpackedR16(&sampler[0], ivec2(yU >> STEP_BITS, yV)) >>
          rescaleBits;
      auto uPx =
          textureLinearUnpackedR16(&sampler[1], ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      auto vPx =
          textureLinearUnpackedR16(&sampler[2], ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      unaligned_store(dest, YUVConverter<COLOR_SPACE>::convert(zip(yPx, yPx),
                                                               zip(uPx, vPx)));
      dest += 4;
      yU += yDU;
      cU += cDU;
    }
    if (span > 0) {
      // Handle any remaining pixels...
      auto yPx =
          textureLinearUnpackedR16(&sampler[0], ivec2(yU >> STEP_BITS, yV)) >>
          rescaleBits;
      auto uPx =
          textureLinearUnpackedR16(&sampler[1], ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      auto vPx =
          textureLinearUnpackedR16(&sampler[2], ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      partial_store_span(
          dest,
          YUVConverter<COLOR_SPACE>::convert(zip(yPx, yPx), zip(uPx, vPx)),
          span);
    }
  } else {
    assert(sampler[0].format == TextureFormat::R8);
    assert(colorDepth == 8);

    // Calculate varying and constant interp data for Y plane.
    int16_t yFracV = yV & 0x7F;
    yV >>= 7;
    int32_t yOffsetV = clampCoord(yV, sampler[0].height) * sampler[0].stride;
    int32_t yStrideV =
        yV >= 0 && yV < int32_t(sampler[0].height) - 1 ? sampler[0].stride : 0;

    // Calculate varying and constant interp data for chroma planes.
    int16_t cFracV = cV & 0x7F;
    cV >>= 7;
    int32_t cOffsetV = clampCoord(cV, sampler[1].height) * sampler[1].stride;
    int32_t cStrideV =
        cV >= 0 && cV < int32_t(sampler[1].height) - 1 ? sampler[1].stride : 0;

    for (; span >= 4; span -= 4) {
      // Sample each YUV plane and then transform them by the appropriate color
      // space.
      auto yPx = textureLinearRowR8(&sampler[0], yU >> STEP_BITS, yOffsetV,
                                    yStrideV, yFracV);
      auto uvPx =
          textureLinearRowPairedR8(&sampler[1], &sampler[2], cU >> STEP_BITS,
                                   cOffsetV, cStrideV, cFracV);
      unaligned_store(dest, YUVConverter<COLOR_SPACE>::convert(yPx, uvPx));
      dest += 4;
      yU += yDU;
      cU += cDU;
    }
    if (span > 0) {
      // Handle any remaining pixels...
      auto yPx = textureLinearRowR8(&sampler[0], yU >> STEP_BITS, yOffsetV,
                                    yStrideV, yFracV);
      auto uvPx =
          textureLinearRowPairedR8(&sampler[1], &sampler[2], cU >> STEP_BITS,
                                   cOffsetV, cStrideV, cFracV);
      partial_store_span(dest, YUVConverter<COLOR_SPACE>::convert(yPx, uvPx),
                         span);
    }
  }
}

static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex,
                               YUVColorSpace colorSpace, int colorDepth,
                               const IntRect& srcReq, Texture& dsttex,
                               const IntRect& dstReq, bool invertY,
                               const IntRect& clipRect) {
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
  dstBounds.intersect(clipRect);
  // Check if sampling bounds are empty
  if (dstBounds.is_empty()) {
    return;
  }
  // Initialize samplers for source textures
  sampler2D_impl sampler[3];
  init_sampler(&sampler[0], ytex);
  init_sampler(&sampler[1], utex);
  init_sampler(&sampler[2], vtex);

  // Compute source UVs
  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
                     float(srcReq.height()) / dstReq.height());
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcUV.y += srcReq.height();
    srcDUV.y = -srcDUV.y;
  }
  // Skip to clamped source start
  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
  // Calculate separate chroma UVs for chroma planes with different scale
  vec2_scalar chromaScale(float(utex.width) / ytex.width,
                          float(utex.height) / ytex.height);
  vec2_scalar chromaUV = srcUV * chromaScale;
  vec2_scalar chromaDUV = srcDUV * chromaScale;
  // Scale UVs by lerp precision. If the row has only 1 pixel, then don't
  // quantize so that we can use nearest filtering instead to avoid overreads.
  if (ytex.width >= 2 && utex.width >= 2) {
    srcUV = linearQuantize(srcUV, 128);
    srcDUV *= 128.0f;
    chromaUV = linearQuantize(chromaUV, 128);
    chromaDUV *= 128.0f;
  }
  // Calculate dest pointer from clamped offsets
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds, 0);
  int span = dstBounds.width();
  for (int rows = dstBounds.height(); rows > 0; rows--) {
    switch (colorSpace) {
      case REC_601:
        linear_row_yuv<REC_601>((uint32_t*)dest, span, srcUV, srcDUV.x,
                                chromaUV, chromaDUV.x, sampler, colorDepth);
        break;
      case REC_709:
        linear_row_yuv<REC_709>((uint32_t*)dest, span, srcUV, srcDUV.x,
                                chromaUV, chromaDUV.x, sampler, colorDepth);
        break;
      case REC_2020:
        linear_row_yuv<REC_2020>((uint32_t*)dest, span, srcUV, srcDUV.x,
                                 chromaUV, chromaDUV.x, sampler, colorDepth);
        break;
      case IDENTITY:
        linear_row_yuv<IDENTITY>((uint32_t*)dest, span, srcUV, srcDUV.x,
                                 chromaUV, chromaDUV.x, sampler, colorDepth);
        break;
      default:
        debugf("unknown YUV color space %d\n", colorSpace);
        assert(false);
        break;
    }
    dest += destStride;
    srcUV.y += srcDUV.y;
    chromaUV.y += chromaDUV.y;
  }
}

extern "C" {

// Extension for compositing a YUV surface represented by separate YUV planes
// to a BGRA destination. The supplied color space is used to determine the
// transform from YUV to BGRA after sampling.
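// Note (illustration, not part of the original patch): for 4:2:0 content the
// U/V planes are typically half the Y plane's width and height, so the
// chromaScale computed in linear_convert_yuv becomes (0.5, 0.5) and the chroma
// planes are stepped at half the rate of the Y plane.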
void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY,
                  LockedTexture* lockedU, LockedTexture* lockedV,
                  YUVColorSpace colorSpace, GLuint colorDepth, GLint srcX,
                  GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
                  GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
                  GLboolean flip, GLint clipX, GLint clipY, GLsizei clipWidth,
                  GLsizei clipHeight) {
  if (!lockedDst || !lockedY || !lockedU || !lockedV) {
    return;
  }
  Texture& ytex = *lockedY;
  Texture& utex = *lockedU;
  Texture& vtex = *lockedV;
  Texture& dsttex = *lockedDst;
  // All YUV planes must currently be represented by R8 or R16 textures.
  // The chroma (U/V) planes must have matching dimensions.
  assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp());
  assert((ytex.bpp() == 1 && colorDepth == 8) ||
         (ytex.bpp() == 2 && colorDepth > 8));
  // assert(ytex.width == utex.width && ytex.height == utex.height);
  assert(utex.width == vtex.width && utex.height == vtex.height);
  assert(ytex.offset == utex.offset && ytex.offset == vtex.offset);
  assert(dsttex.bpp() == 4);

  IntRect srcReq =
      IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset;
  IntRect dstReq =
      IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
  // Compute clip rect as relative to the dstReq, as that's the same coords
  // as used for the sampling bounds.
  IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
                      clipY - dstY + clipHeight};
  // For now, always use a linear filter path that would be required for
  // scaling. Further fast-paths for non-scaled video might be desirable in the
  // future.
  linear_convert_yuv(ytex, utex, vtex, colorSpace, colorDepth, srcReq, dsttex,
                     dstReq, flip, clipRect);
}

}  // extern "C"
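A minimal usage sketch (illustrative only, not part of the patch): a caller that already owns the SWGL context might drive the opaque fast path roughly as below, assuming hypothetical ids dstFboId and srcTexId that were created and uploaded elsewhere.

// Illustrative sketch only - the ids and sizes below are hypothetical.
LockedTexture* dst = LockFramebuffer(dstFboId);
LockedTexture* src = LockTexture(srcTexId);
if (dst && src) {
  // Blit a 256x256 source onto a 512x512 destination, clipped to the full
  // destination. Composite picks the linear path here since the sizes differ
  // and the source row is at least 2 pixels wide.
  Composite(dst, src, /*srcX*/ 0, /*srcY*/ 0, 256, 256,
            /*dstX*/ 0, /*dstY*/ 0, 512, 512,
            /*opaque*/ GL_TRUE, /*flip*/ GL_FALSE, GL_LINEAR,
            /*clipX*/ 0, /*clipY*/ 0, 512, 512);
}
UnlockResource(src);
UnlockResource(dst);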