summaryrefslogtreecommitdiffstats
path: root/gfx/wr/swgl
diff options
context:
space:
mode:
Diffstat (limited to 'gfx/wr/swgl')
-rw-r--r--gfx/wr/swgl/Cargo.toml15
-rw-r--r--gfx/wr/swgl/README.md45
-rw-r--r--gfx/wr/swgl/build.rs148
-rw-r--r--gfx/wr/swgl/src/composite.h922
-rw-r--r--gfx/wr/swgl/src/gl.cc4445
-rw-r--r--gfx/wr/swgl/src/gl_defs.h193
-rw-r--r--gfx/wr/swgl/src/glsl.h2669
-rw-r--r--gfx/wr/swgl/src/lib.rs12
-rw-r--r--gfx/wr/swgl/src/program.h166
-rw-r--r--gfx/wr/swgl/src/swgl_ext.h532
-rw-r--r--gfx/wr/swgl/src/swgl_fns.rs2490
-rw-r--r--gfx/wr/swgl/src/texture.h1262
-rw-r--r--gfx/wr/swgl/src/vector_type.h514
13 files changed, 13413 insertions, 0 deletions
diff --git a/gfx/wr/swgl/Cargo.toml b/gfx/wr/swgl/Cargo.toml
new file mode 100644
index 0000000000..9b7624b13e
--- /dev/null
+++ b/gfx/wr/swgl/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "swgl"
+version = "0.1.0"
+license = "MPL-2.0"
+authors = ["The Mozilla Project Developers"]
+build = "build.rs"
+description = "Software OpenGL implementation for WebRender."
+
+[build-dependencies]
+cc = "1.0.46"
+glsl-to-cxx = { path = "../glsl-to-cxx" }
+webrender_build = { path = "../webrender_build" }
+
+[dependencies]
+gleam = "0.13.1"
diff --git a/gfx/wr/swgl/README.md b/gfx/wr/swgl/README.md
new file mode 100644
index 0000000000..eac1ec3798
--- /dev/null
+++ b/gfx/wr/swgl/README.md
@@ -0,0 +1,45 @@
+# swgl
+
+Software OpenGL implementation for WebRender
+
+## Overview
+This is a relatively simple single-threaded software rasterizer designed
+for use by WebRender. It will shade one quad at a time using a 4xf32 vector
+with one vertex per lane. It rasterizes quads using spans and shades each
+span 4 pixels at a time.
+
+## Building
+clang-cl is required to build on Windows. This can be done by installing
+the llvm binaries from https://releases.llvm.org/ and adding the installation
+to the path with something like `set PATH=%PATH%;C:\Program Files\LLVM\bin`.
+Then `set CC=clang-cl` and `set CXX=clang-cl`. That should be sufficient
+for `cc-rs` to use `clang-cl` instead of `cl`.
+
+## Extensions
+SWGL contains a number of OpenGL and GLSL extensions designed to both ease
+integration with WebRender and to help accelerate span rasterization.
+
+GLSL extension intrinsics are generally prefixed with `swgl_` to distinguish
+them from other items in the GLSL namespace.
+
+Inside GLSL, the `SWGL` preprocessor token is defined so that usage of SWGL
+extensions may be conditionally compiled.
+
+```
+void swgl_clipMask(sampler2D mask, vec2 offset, vec2 bb_origin, vec2 bb_size);
+```
+
+When called from the vertex shader, this specifies a clip mask texture to
+be used to mask the currently drawn primitive while blending is enabled. This
+mask will only apply to the current primitive.
+
+The mask must be an R8 texture that will be interpreted as alpha weighting
+applied to the source pixel prior to the blend stage. It is sampled 1:1 with
+nearest filtering without any applied transform. The given offset specifies
+the positioning of the clip mask relative to the framebuffer's viewport.
+
+The supplied bounding box constrains sampling of the clip mask to only fall
+within the given rectangle, specified relative to the clip mask offset.
+Anything falling outside this rectangle will be clipped entirely. If the
+rectangle is empty, then the clip mask will be ignored.
+
diff --git a/gfx/wr/swgl/build.rs b/gfx/wr/swgl/build.rs
new file mode 100644
index 0000000000..b20dacd593
--- /dev/null
+++ b/gfx/wr/swgl/build.rs
@@ -0,0 +1,148 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+extern crate cc;
+extern crate glsl_to_cxx;
+extern crate webrender_build;
+
+use std::collections::HashSet;
+use std::fmt::Write;
+use webrender_build::shader::{ShaderFeatureFlags, get_shader_features};
+
+// Maps a shader key in "name feature,feature" format to the file stem
+// "name_feature_feature" by turning every space and comma into an underscore.
+fn shader_file(shader_key: &str) -> String {
+    shader_key.replace(|c: char| c == ' ' || c == ',', "_")
+}
+
+// Generates `$OUT_DIR/load_shader.h`. The header includes the translated
+// `<file>.h` of every shader key and defines a C++ `load_shader` function
+// that returns `<file>_program::loader` for a matching key, or nullptr when
+// the name is unknown. Panics if OUT_DIR is unset or the file can't be written.
+fn write_load_shader(shader_keys: &[String]) {
+    // File stems are needed twice (includes and lookup table), so compute once.
+    let stems: Vec<String> = shader_keys.iter().map(|key| shader_file(key)).collect();
+
+    let mut header = String::new();
+    for stem in &stems {
+        // Formatting into a String cannot fail, so the result is discarded.
+        let _ = write!(header, "#include \"{}.h\"\n", stem);
+    }
+    header.push_str("ProgramLoader load_shader(const char* name) {\n");
+    for (key, stem) in shader_keys.iter().zip(&stems) {
+        let _ = write!(header, " if (!strcmp(name, \"{}\")) {{ return {}_program::loader; }}\n",
+                       key, stem);
+    }
+    header.push_str(" return nullptr;\n}\n");
+
+    let out_path = std::env::var("OUT_DIR").unwrap() + "/load_shader.h";
+    std::fs::write(out_path, header).unwrap();
+}
+
+// Recursively flattens `<shader_dir>/<shader>.glsl` into `output`.
+//
+// `#include a,b` lines are replaced by the (recursively flattened) contents of
+// the named shaders, `#version` and `#extension` lines are dropped, and every
+// other line is copied through followed by a newline. `included` records the
+// shaders already emitted so each one is expanded at most once. Also tells
+// Cargo to rerun the build script when the file changes. Panics if the file
+// cannot be read.
+fn process_imports(shader_dir: &str, shader: &str, included: &mut HashSet<String>, output: &mut String) {
+    // `insert` returns false when this shader has been processed before.
+    if !included.insert(shader.into()) {
+        return;
+    }
+    let path = format!("{}/{}.glsl", shader_dir, shader);
+    println!("cargo:rerun-if-changed={}", path);
+    let source = std::fs::read_to_string(&path).unwrap();
+    for line in source.lines() {
+        if let Some(import_list) = line.strip_prefix("#include ") {
+            for import in import_list.split(',') {
+                process_imports(shader_dir, import, included, output);
+            }
+            continue;
+        }
+        if line.starts_with("#version ") || line.starts_with("#extension ") {
+            // These directives are not forwarded to the flattened source.
+            continue;
+        }
+        output.push_str(line);
+        output.push('\n');
+    }
+}
+
+// Translates one shader variant into a C++ header in OUT_DIR.
+//
+// `shader_key` is "name" or "name feature,feature". The GLSL source (with its
+// includes flattened and feature defines prepended) is written to
+// `<file>.c`, run through the C preprocessor once per stage to produce
+// `<file>.vert` and `<file>.frag`, and those two files are handed to
+// glsl_to_cxx, whose output is saved as `<file>.h`. Panics on any I/O error.
+fn translate_shader(shader_key: &str, shader_dir: &str) {
+    // Everything before the first space is the shader name; the remainder
+    // (including the leading space) is the feature list, possibly empty.
+    let split_at = shader_key.find(' ').unwrap_or(shader_key.len());
+    let (basename, features) = shader_key.split_at(split_at);
+
+    let mut imported = String::from("#define SWGL 1\n");
+    let _ = write!(imported, "#define WR_MAX_VERTEX_TEXTURE_WIDTH {}U\n",
+                   webrender_build::MAX_VERTEX_TEXTURE_WIDTH);
+    if !features.is_empty() {
+        for feature in features.trim().split(',') {
+            let _ = write!(imported, "#define WR_FEATURE_{}\n", feature);
+        }
+    }
+    process_imports(shader_dir, basename, &mut HashSet::new(), &mut imported);
+
+    let out_dir = std::env::var("OUT_DIR").unwrap();
+    let shader = shader_file(shader_key);
+    let imp_name = format!("{}/{}.c", out_dir, shader);
+    std::fs::write(&imp_name, imported).unwrap();
+
+    // Configure a preprocess-only compiler invocation for the flattened file.
+    let mut build = cc::Build::new();
+    if build.get_compiler().is_like_msvc() {
+        build.flag("/EP");
+    } else {
+        build.flag("-xc").flag("-P");
+    }
+    build.file(&imp_name);
+
+    // Expand the same source once per stage, selecting the stage by define.
+    let expand_stage = |stage_define: &str| {
+        build.clone().define(stage_define, Some("1")).expand()
+    };
+    let vs = expand_stage("WR_VERTEX_SHADER");
+    let fs = expand_stage("WR_FRAGMENT_SHADER");
+
+    let vs_name = format!("{}/{}.vert", out_dir, shader);
+    let fs_name = format!("{}/{}.frag", out_dir, shader);
+    std::fs::write(&vs_name, vs).unwrap();
+    std::fs::write(&fs_name, fs).unwrap();
+
+    // glsl_to_cxx takes command-line style arguments, program name first.
+    let args = vec!["glsl_to_cxx".to_string(), vs_name, fs_name];
+    let result = glsl_to_cxx::translate(&mut args.into_iter());
+    std::fs::write(format!("{}/{}.h", out_dir, shader), result).unwrap();
+}
+
+// Build script entry point: translates every WebRender shader variant to
+// C++, generates the `load_shader.h` lookup header, and compiles `src/gl.cc`
+// into the `gl_cc` static library.
+fn main() {
+    // Prefer the in-tree location when MOZ_SRC is set; otherwise fall back to
+    // the sibling webrender crate relative to this crate's manifest.
+    let shader_dir = std::env::var("MOZ_SRC")
+        .map(|dir| dir + "/gfx/wr/webrender/res")
+        .unwrap_or_else(|_| std::env::var("CARGO_MANIFEST_DIR").unwrap() + "/../webrender/res");
+
+    let shader_flags = ShaderFeatureFlags::GL
+        | ShaderFeatureFlags::DUAL_SOURCE_BLENDING
+        | ShaderFeatureFlags::DEBUG;
+
+    // Build one "name" or "name feature,feature" key per shader variant.
+    let mut shaders: Vec<String> = Vec::new();
+    for (name, features) in get_shader_features(shader_flags) {
+        shaders.extend(features.iter().map(|f| {
+            if f.is_empty() { name.to_owned() } else { format!("{} {}", name, f) }
+        }));
+    }
+    shaders.sort();
+
+    for shader in &shaders {
+        translate_shader(shader, &shader_dir);
+    }
+    write_load_shader(&shaders);
+
+    let tracked_sources = [
+        "src/composite.h",
+        "src/gl_defs.h",
+        "src/glsl.h",
+        "src/program.h",
+        "src/swgl_ext.h",
+        "src/texture.h",
+        "src/vector_type.h",
+        "src/gl.cc",
+    ];
+    for path in tracked_sources.iter() {
+        println!("cargo:rerun-if-changed={}", path);
+    }
+
+    let mut gl = cc::Build::new();
+    gl.cpp(true).file("src/gl.cc");
+    let cxx_flags = [
+        "-std=c++14",
+        "-UMOZILLA_CONFIG_H",
+        "-fno-exceptions",
+        "-fno-rtti",
+        "-fno-math-errno",
+    ];
+    for flag in cxx_flags.iter() {
+        gl.flag(flag);
+    }
+    gl.define("_GLIBCXX_USE_CXX11_ABI", Some("0"))
+        .include(shader_dir)
+        .include("src")
+        .include(std::env::var("OUT_DIR").unwrap())
+        .compile("gl_cc");
+}
diff --git a/gfx/wr/swgl/src/composite.h b/gfx/wr/swgl/src/composite.h
new file mode 100644
index 0000000000..a5a4489e6d
--- /dev/null
+++ b/gfx/wr/swgl/src/composite.h
@@ -0,0 +1,922 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Nearest-neighbor resample of one row: writes `span` pixels to dst, reading
+// from src and advancing the source by srcWidth/dstWidth pixels per output
+// pixel. `frac` is the starting remainder of that ratio, in units of
+// 1/dstWidth of a source pixel.
+template <typename P>
+static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
+                             int span, int frac) {
+  P* const stop = dst + span;
+  while (dst < stop) {
+    *dst++ = *src;
+    // Accumulate one source-width step per output pixel; each time the
+    // accumulator covers a full destination width, move one source pixel on.
+    frac += srcWidth;
+    while (frac >= dstWidth) {
+      frac -= dstWidth;
+      src++;
+    }
+  }
+}
+
+// Copies the srcReq region of srctex (layer srcZ) into the dstReq region of
+// dsttex (layer dstZ) without filtering: rows are memcpy'd when source and
+// dest widths match and otherwise resampled with scale_row. When invertY is
+// set the source rows are walked in reverse. clipRect is intersected with the
+// dest sampling bounds, so it must be expressed in the same coordinates as
+// those bounds. Returns early if nothing remains after clipping.
+static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq,
+ int srcZ, Texture& dsttex,
+ const IntRect& dstReq, int dstZ, bool invertY,
+ const IntRect& clipRect) {
+ // Cache scaling ratios
+ int srcWidth = srcReq.width();
+ int srcHeight = srcReq.height();
+ int dstWidth = dstReq.width();
+ int dstHeight = dstReq.height();
+ // Compute valid dest bounds
+ IntRect dstBounds = dsttex.sample_bounds(dstReq);
+ // Compute valid source bounds
+ // Scale source to dest, rounding inward to avoid sampling outside source
+ IntRect srcBounds = srctex.sample_bounds(srcReq, invertY).scale(
+ srcWidth, srcHeight, dstWidth, dstHeight, true);
+ // Limit dest sampling bounds to overlap source bounds
+ dstBounds.intersect(srcBounds);
+ // Compute the clipped bounds, relative to dstBounds.
+ IntRect clippedDest = dstBounds.intersection(clipRect) - dstBounds.origin();
+ // Check if clipped sampling bounds are empty
+ if (clippedDest.is_empty()) {
+ return;
+ }
+ // Compute final source bounds from clamped dest sampling bounds
+ srcBounds =
+ IntRect(dstBounds).scale(dstWidth, dstHeight, srcWidth, srcHeight);
+ // Calculate source and dest pointers from clamped offsets
+ int bpp = srctex.bpp();
+ int srcStride = srctex.stride();
+ int destStride = dsttex.stride();
+ char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ);
+ char* src = srctex.sample_ptr(srcReq, srcBounds, srcZ, invertY);
+ // Inverted Y must step downward along source rows
+ if (invertY) {
+ srcStride = -srcStride;
+ }
+ int span = clippedDest.width();
+ // Source position of the clipped origin, kept as numerators over
+ // dstWidth/dstHeight so the integer part can be split from the remainder.
+ int fracX = srcWidth * clippedDest.x0;
+ int fracY = srcHeight * clippedDest.y0;
+ // Skip the dest pointer to the clipped origin.
+ dest += destStride * clippedDest.y0;
+ dest += bpp * clippedDest.x0;
+ // Skip the source pointer by the whole-pixel part of the scaled offset...
+ src += srcStride * (fracY / dstHeight);
+ src += bpp * (fracX / dstWidth);
+ // ...and keep only the remainders as the starting step fractions.
+ fracY %= dstHeight;
+ fracX %= dstWidth;
+ for (int rows = clippedDest.height(); rows > 0; rows--) {
+ if (srcWidth == dstWidth) {
+ // No scaling, so just do a fast copy.
+ memcpy(dest, src, span * bpp);
+ } else {
+ // Do scaling with different source and dest widths.
+ // Dispatch on pixel size so scale_row copies whole pixels.
+ switch (bpp) {
+ case 1:
+ scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, srcWidth, span,
+ fracX);
+ break;
+ case 2:
+ scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, srcWidth, span,
+ fracX);
+ break;
+ case 4:
+ scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, srcWidth, span,
+ fracX);
+ break;
+ default:
+ // Only 1, 2 and 4 byte pixels are handled here.
+ assert(false);
+ break;
+ }
+ }
+ dest += destStride;
+ // Step source according to height ratio.
+ for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) {
+ src += srcStride;
+ }
+ }
+}
+
+// Writes one row of `span` 4-byte pixels to dest, sampling the source through
+// textureLinearPackedRGBA8. srcUV is the row's starting coordinate and srcDU
+// the per-pixel step in X; linear_blit passes both already scaled by 128.
+static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV,
+ float srcDU, int srcZOffset,
+ sampler2DArray sampler) {
+ // Only X is stepped (the Y step is 0) because Y is constant along a row.
+ vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+ // Process full chunks of 4 pixels.
+ for (; span >= 4; span -= 4) {
+ auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+ unaligned_store(dest, srcpx);
+ dest += 4;
+ uv.x += 4 * srcDU;
+ }
+ // Fewer than 4 pixels remain: sample a chunk but store only `span` pixels.
+ if (span > 0) {
+ auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+ partial_store_span(dest, srcpx, span);
+ }
+}
+
+// Writes one row of `span` 1-byte pixels to dest, sampling the source through
+// textureLinearPackedR8. srcUV is the row's starting coordinate and srcDU the
+// per-pixel step in X; linear_blit passes both already scaled by 128.
+static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV,
+ float srcDU, int srcZOffset,
+ sampler2DArray sampler) {
+ // Only X is stepped (the Y step is 0) because Y is constant along a row.
+ vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+ // Process full chunks of 4 pixels.
+ for (; span >= 4; span -= 4) {
+ auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+ unaligned_store(dest, srcpx);
+ dest += 4;
+ uv.x += 4 * srcDU;
+ }
+ // Fewer than 4 pixels remain: sample a chunk but store only `span` pixels.
+ if (span > 0) {
+ auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+ partial_store_span(dest, srcpx, span);
+ }
+}
+
+// Writes one row of `span` 2-byte pixels to dest, sampling the source through
+// textureLinearPackedRG8. srcUV is the row's starting coordinate and srcDU the
+// per-pixel step in X; linear_blit passes both already scaled by 128.
+static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV,
+ float srcDU, int srcZOffset,
+ sampler2DArray sampler) {
+ // Only X is stepped (the Y step is 0) because Y is constant along a row.
+ vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+ // Process full chunks of 4 pixels.
+ for (; span >= 4; span -= 4) {
+ auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv), srcZOffset);
+ unaligned_store(dest, srcpx);
+ dest += 4;
+ uv.x += 4 * srcDU;
+ }
+ // Fewer than 4 pixels remain: sample a chunk but store only `span` pixels.
+ if (span > 0) {
+ auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv), srcZOffset);
+ partial_store_span(dest, srcpx, span);
+ }
+}
+
+// Copies the srcReq region of srctex (layer srcZ) into the dstReq region of
+// dsttex (layer dstZ), sampling the source with a LINEAR-filter sampler. The
+// source must be GL_RGBA8, GL_R8 or GL_RG8 (asserted). When invertY is set the
+// source is read bottom-up. clipRect is intersected with the dest sampling
+// bounds, so it must be expressed in the same coordinates as those bounds.
+static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq,
+ int srcZ, Texture& dsttex,
+ const IntRect& dstReq, int dstZ, bool invertY,
+ const IntRect& clipRect) {
+ assert(srctex.internal_format == GL_RGBA8 ||
+ srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8);
+ // Compute valid dest bounds
+ IntRect dstBounds = dsttex.sample_bounds(dstReq);
+ dstBounds.intersect(clipRect);
+ // Check if sampling bounds are empty
+ if (dstBounds.is_empty()) {
+ return;
+ }
+ // Initialize sampler for source texture
+ sampler2DArray_impl sampler;
+ init_sampler(&sampler, srctex);
+ init_depth(&sampler, srctex);
+ sampler.filter = TextureFilter::LINEAR;
+ // Compute source UVs
+ int srcZOffset = srcZ * sampler.height_stride;
+ vec2_scalar srcUV(srcReq.x0, srcReq.y0);
+ // Source step per dest pixel in each axis.
+ vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
+ float(srcReq.height()) / dstReq.height());
+ // Inverted Y must step downward along source rows
+ if (invertY) {
+ srcUV.y += srcReq.height();
+ srcDUV.y = -srcDUV.y;
+ }
+ // Skip to clamped source start
+ // (the 0.5 offset samples at the center of each dest pixel)
+ srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
+ // Scale UVs by lerp precision
+ srcUV = linearQuantize(srcUV, 128);
+ srcDUV *= 128.0f;
+ // Calculate dest pointer from clamped offsets
+ int bpp = dsttex.bpp();
+ int destStride = dsttex.stride();
+ char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ);
+ int span = dstBounds.width();
+ for (int rows = dstBounds.height(); rows > 0; rows--) {
+ // Dispatch on dest pixel size to the matching linear_row_blit overload.
+ switch (bpp) {
+ case 1:
+ linear_row_blit((uint8_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+ &sampler);
+ break;
+ case 2:
+ linear_row_blit((uint16_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+ &sampler);
+ break;
+ case 4:
+ linear_row_blit((uint32_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+ &sampler);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ // Advance one dest row and the matching source Y step.
+ dest += destStride;
+ srcUV.y += srcDUV.y;
+ }
+}
+
+// Blends one row of `span` linearly-sampled source pixels over the existing
+// dest pixels, computing src + dst - dst * srcAlpha / 255 per channel.
+// NOTE(review): that formula is source-over for premultiplied alpha — confirm
+// that callers always supply premultiplied sources.
+static void linear_row_composite(uint32_t* dest, int span,
+ const vec2_scalar& srcUV, float srcDU,
+ sampler2D sampler) {
+ // Only X is stepped (the Y step is 0) because Y is constant along a row.
+ vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+ // Process full chunks of 4 pixels.
+ for (; span >= 4; span -= 4) {
+ WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv), 0);
+ WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+ PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+ unaligned_store(dest, r);
+
+ dest += 4;
+ uv.x += 4 * srcDU;
+ }
+ // Fewer than 4 pixels remain: load and store only `span` dest pixels.
+ if (span > 0) {
+ WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv), 0);
+ WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span));
+ PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+ partial_store_span(dest, r, span);
+ }
+}
+
+// Blends the srcReq region of srctex over the dstReq region of dsttex, scaling
+// with a LINEAR-filter sampler. Both textures must have 4 bytes per pixel
+// (asserted). When invertY is set the source is read bottom-up. clipRect is
+// intersected with the dest sampling bounds, so it must be expressed in the
+// same coordinates as those bounds.
+static NO_INLINE void linear_composite(Texture& srctex, const IntRect& srcReq,
+ Texture& dsttex, const IntRect& dstReq,
+ bool invertY, const IntRect& clipRect) {
+ assert(srctex.bpp() == 4);
+ assert(dsttex.bpp() == 4);
+ // Compute valid dest bounds
+ IntRect dstBounds = dsttex.sample_bounds(dstReq);
+ dstBounds.intersect(clipRect);
+ // Check if sampling bounds are empty
+ if (dstBounds.is_empty()) {
+ return;
+ }
+ // Initialize sampler for source texture
+ sampler2D_impl sampler;
+ init_sampler(&sampler, srctex);
+ sampler.filter = TextureFilter::LINEAR;
+ // Compute source UVs
+ vec2_scalar srcUV(srcReq.x0, srcReq.y0);
+ // Source step per dest pixel in each axis.
+ vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
+ float(srcReq.height()) / dstReq.height());
+ // Inverted Y must step downward along source rows
+ if (invertY) {
+ srcUV.y += srcReq.height();
+ srcDUV.y = -srcDUV.y;
+ }
+ // Skip to clamped source start
+ // (the 0.5 offset samples at the center of each dest pixel)
+ srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
+ // Scale UVs by lerp precision
+ srcUV = linearQuantize(srcUV, 128);
+ srcDUV *= 128.0f;
+ // Calculate dest pointer from clamped offsets
+ int destStride = dsttex.stride();
+ char* dest = dsttex.sample_ptr(dstReq, dstBounds, 0);
+ int span = dstBounds.width();
+ for (int rows = dstBounds.height(); rows > 0; rows--) {
+ linear_row_composite((uint32_t*)dest, span, srcUV, srcDUV.x, &sampler);
+ // Advance one dest row and the matching source Y step.
+ dest += destStride;
+ srcUV.y += srcDUV.y;
+ }
+}
+
+extern "C" {
+
+// Copies a rectangle of the read framebuffer's color attachment into a
+// rectangle of the draw framebuffer's color attachment, scaling if the two
+// rectangles differ in size.
+//
+// Only GL_COLOR_BUFFER_BIT is supported (asserted). The call is a no-op when
+// either framebuffer is missing, has no backing buffer, or targets a layer
+// outside its texture, when the internal formats differ, or when either
+// rectangle is empty. A rectangle given with Y1 < Y0 requests a vertical
+// flip. GL_LINEAR filtering is used only when the sizes differ, the source is
+// at least 2 pixels wide, and the format is RGBA8, R8 or RG8; every other
+// case uses the unfiltered scale_blit path.
+void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                     GLbitfield mask, GLenum filter) {
+  assert(mask == GL_COLOR_BUFFER_BIT);
+  Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
+  if (!srcfb || srcfb->layer < 0) return;
+  Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
+  if (!dstfb || dstfb->layer < 0) return;
+  Texture& srctex = ctx->textures[srcfb->color_attachment];
+  if (!srctex.buf || srcfb->layer >= max(srctex.depth, 1)) return;
+  Texture& dsttex = ctx->textures[dstfb->color_attachment];
+  if (!dsttex.buf || dstfb->layer >= max(dsttex.depth, 1)) return;
+  assert(!dsttex.locked);
+  if (srctex.internal_format != dsttex.internal_format) {
+    assert(false);
+    return;
+  }
+  // Force flipped Y onto dest coordinates
+  if (srcY1 < srcY0) {
+    swap(srcY0, srcY1);
+    swap(dstY0, dstY1);
+  }
+  bool invertY = dstY1 < dstY0;
+  if (invertY) {
+    swap(dstY0, dstY1);
+  }
+  IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset;
+  IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset;
+  if (srcReq.is_empty() || dstReq.is_empty()) {
+    return;
+  }
+  // The blit helpers intersect the clip rect with the dest sampling bounds,
+  // which are relative to the dest request, so a clip covering the whole
+  // request starts at the origin rather than at dstReq's own position.
+  IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()};
+  prepare_texture(srctex);
+  prepare_texture(dsttex, &dstReq);
+  if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR &&
+      (srctex.internal_format == GL_RGBA8 || srctex.internal_format == GL_R8 ||
+       srctex.internal_format == GL_RG8)) {
+    // Pass the request-relative clip here too; passing dstReq itself would
+    // wrongly trim the blit whenever the dest rectangle is not at the origin.
+    linear_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+                invertY, clipRect);
+  } else {
+    scale_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+               invertY, clipRect);
+  }
+}
+
+typedef Texture LockedTexture;
+
+// Takes a lock on the texture with the given id so that it is not modified
+// while locked. Returns nullptr (after asserting in debug builds) when the
+// texture has no backing buffer.
+LockedTexture* LockTexture(GLuint texId) {
+  Texture* tex = &ctx->textures[texId];
+  if (tex->buf == nullptr) {
+    assert(tex->buf != nullptr);
+    return nullptr;
+  }
+  const auto priorLocks = __sync_fetch_and_add(&tex->locked, 1);
+  if (priorLocks == 0) {
+    // First lock on this texture: flush any delayed clears before handing
+    // the buffer out.
+    prepare_texture(*tex);
+  }
+  return (LockedTexture*)tex;
+}
+
+// Locks the color attachment of the given framebuffer to prevent
+// modification. Locking is refused (nullptr, with debug assertions) unless
+// the framebuffer has a color attachment and does not target a layer past
+// the first.
+LockedTexture* LockFramebuffer(GLuint fboId) {
+  Framebuffer& fb = ctx->framebuffers[fboId];
+  const bool lockable = fb.color_attachment && fb.layer <= 0;
+  if (lockable) {
+    return LockTexture(fb.color_attachment);
+  }
+  assert(fb.color_attachment != 0);
+  assert(fb.layer == 0);
+  return nullptr;
+}
+
+// Adds one more reference to a resource that is already locked. A null
+// resource is ignored.
+void LockResource(LockedTexture* resource) {
+  if (resource) {
+    __sync_fetch_and_add(&resource->locked, 1);
+  }
+}
+
+// Drops one lock from a previously locked texture. A null resource is
+// ignored.
+void UnlockResource(LockedTexture* resource) {
+  if (resource == nullptr) {
+    return;
+  }
+  const auto locksBefore = __sync_fetch_and_add(&resource->locked, -1);
+  if (locksBefore <= 0) {
+    // Unlocking something that was not locked is a caller bug.
+    assert(0);
+  }
+}
+
+// Get the underlying buffer for a locked resource, reporting its width,
+// height and row stride through the out-parameters. A null resource (for
+// example the result of a failed LockTexture/LockFramebuffer) yields a null
+// buffer with zeroed dimensions instead of a null dereference, matching the
+// null tolerance of LockResource and UnlockResource.
+void* GetResourceBuffer(LockedTexture* resource, int32_t* width,
+                        int32_t* height, int32_t* stride) {
+  if (!resource) {
+    *width = 0;
+    *height = 0;
+    *stride = 0;
+    return nullptr;
+  }
+  *width = resource->width;
+  *height = resource->height;
+  *stride = resource->stride();
+  return resource->buf;
+}
+
+// Blends `span` source pixels over the dest pixels at a 1:1 scale, computing
+// src + dst - dst * srcAlpha / 255 per channel, four pixels at a time.
+static void unscaled_row_composite(uint32_t* dest, const uint32_t* src,
+                                   int span) {
+  int remaining = span;
+  // Full chunks of 4 pixels.
+  for (; remaining >= 4; remaining -= 4) {
+    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    unaligned_store(dest, r);
+    src += 4;
+    dest += 4;
+  }
+  // Trailing 1-3 pixels are loaded and stored partially so that nothing past
+  // the end of the row is touched.
+  if (remaining > 0) {
+    WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, remaining));
+    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, remaining));
+    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    partial_store_span(dest, r, remaining);
+  }
+}
+
+// Blends the srcReq region of srctex over the dstReq region of dsttex without
+// any scaling. The area written is the overlap of the dest sampling bounds,
+// clipRect and the source sampling bounds. When invertY is set the source
+// rows are walked in reverse.
+static NO_INLINE void unscaled_composite(Texture& srctex, const IntRect& srcReq,
+                                         Texture& dsttex, const IntRect& dstReq,
+                                         bool invertY,
+                                         const IntRect& clipRect) {
+  IntRect bounds = dsttex.sample_bounds(dstReq);
+  bounds.intersect(clipRect);
+  bounds.intersect(srctex.sample_bounds(srcReq, invertY));
+  char* dstRow = dsttex.sample_ptr(dstReq, bounds, 0);
+  char* srcRow = srctex.sample_ptr(srcReq, bounds, 0, invertY);
+  int srcStep = srctex.stride();
+  const int dstStep = dsttex.stride();
+  // A flipped source is read from its last row towards its first.
+  if (invertY) {
+    srcStep = -srcStep;
+  }
+  const int width = bounds.width();
+  int rowsLeft = bounds.height();
+  while (rowsLeft > 0) {
+    unscaled_row_composite((uint32_t*)dstRow, (const uint32_t*)srcRow, width);
+    dstRow += dstStep;
+    srcRow += srcStep;
+    rowsLeft--;
+  }
+}
+
+// Extension for optimized compositing of textures or framebuffers that may be
+// safely used across threads. The source and destination must be locked to
+// ensure that they can be safely accessed while the SWGL context might be used
+// by another thread. Band extents along the Y axis may be used to clip the
+// destination rectangle without affecting the integer scaling ratios.
+//
+// Does nothing if either locked texture is null. Both textures must have 4
+// bytes per pixel (asserted). `opaque` selects a plain copy; otherwise the
+// source is blended over the destination. `flip` is forwarded as the invertY
+// argument of the blit/composite helpers. `filter` is only consulted on the
+// opaque path.
+void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX,
+ GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
+ GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
+ GLboolean opaque, GLboolean flip, GLenum filter, GLint clipX,
+ GLint clipY, GLsizei clipWidth, GLsizei clipHeight) {
+ if (!lockedDst || !lockedSrc) {
+ return;
+ }
+ Texture& srctex = *lockedSrc;
+ Texture& dsttex = *lockedDst;
+ assert(srctex.bpp() == 4);
+ assert(dsttex.bpp() == 4);
+
+ // Convert the requested rectangles into each texture's own coordinates.
+ IntRect srcReq =
+ IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset;
+ IntRect dstReq =
+ IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
+ // Compute clip rect as relative to the dstReq, as that's the same coords
+ // as used for the sampling bounds.
+ IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
+ clipY - dstY + clipHeight};
+
+ if (opaque) {
+ // Ensure we have rows of at least 2 pixels when using the linear filter
+ // to avoid overreading the row.
+ if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) {
+ linear_blit(srctex, srcReq, 0, dsttex, dstReq, 0, flip, clipRect);
+ } else {
+ scale_blit(srctex, srcReq, 0, dsttex, dstReq, 0, flip, clipRect);
+ }
+ } else {
+ // Blended path: scaled requests always go through the linear sampler,
+ // regardless of `filter`; same-size requests use the 1:1 composite.
+ if (!srcReq.same_size(dstReq) && srctex.width >= 2) {
+ linear_composite(srctex, srcReq, dsttex, dstReq, flip, clipRect);
+ } else {
+ unscaled_composite(srctex, srcReq, dsttex, dstReq, flip, clipRect);
+ }
+ }
+}
+
+} // extern "C"
+
+// Saturated add helper for YUV conversion. Supported platforms have intrinsics
+// to do this natively, but support a slower generic fallback just in case.
+// Lane-wise signed 16-bit addition that clamps on overflow. Uses the SSE2 or
+// NEON saturating-add intrinsic when USE_SSE2 / USE_NEON is set and a
+// bit-twiddling fallback otherwise.
+static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) {
+#if USE_SSE2
+ return _mm_adds_epi16(x, y);
+#elif USE_NEON
+ return vqaddq_s16(x, y);
+#else
+ auto r = x + y;
+ // An overflow occurred if the signs of both inputs x and y did not differ
+ // but yet the sign of the result did differ.
+ // The >> 15 spreads the sign bit so `overflow` is a full-lane mask
+ // (assumes an arithmetic shift on the signed lanes).
+ auto overflow = (~(x ^ y) & (r ^ x)) >> 15;
+ // If there was an overflow, we need to choose the appropriate limit to clamp
+ // to depending on whether or not the inputs are negative.
+ // Non-negative x gives 0x7FFF; negative x gives ~0x7FFF, i.e. -0x8000.
+ auto limit = (x >> 15) ^ 0x7FFF;
+ // If we didn't overflow, just use the result, and otherwise, use the limit.
+ return (~overflow & r) | (overflow & limit);
+#endif
+}
+
+// Interleave and packing helper for YUV conversion. During transform by the
+// color matrix, the color components are de-interleaved as this format is
+// usually what comes out of the planar YUV textures. The components thus need
+// to be interleaved before finally getting packed to BGRA format. Alpha is
+// forced to be opaque.
+// Zips the `br` and `gg` lanes together, packs the result down to 8 bits per
+// channel, and ORs 255 into every fourth byte so alpha is always opaque.
+static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) {
+ return pack(bit_cast<WideRGBA8>(zip(br, gg))) |
+ PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+}
+
+// Color spaces that select a YUVConverter specialization. IDENTITY maps the
+// planes straight through without applying a color matrix.
+enum YUVColorSpace { REC_601 = 0, REC_709, REC_2020, IDENTITY };
+
+// clang-format off
+// Supports YUV color matrixes of the form:
+// [R] [1.1643835616438356, 0.0, rv ] [Y - 16]
+// [G] = [1.1643835616438358, -gu, -gv ] x [U - 128]
+// [B] [1.1643835616438356, bu, 0.0 ] [V - 128]
+// We must be able to multiply a YUV input by a matrix coefficient ranging as
+// high as ~2.2 in the U/V cases, where U/V can be signed values between -128
+// and 127. The largest fixed-point representation we can thus support without
+// overflowing 16 bit integers leaves us 6 bits of fractional precision while
+// also supporting a sign bit. The closest representation of the Y coefficient
+// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces
+// we support. Conversions can still sometimes overflow the precision and
+// require clamping back into range, so we use saturated additions to do this
+// efficiently at no extra cost.
+// clang-format on
+// Fixed-point YUV to RGB conversion for one color matrix. MATRIX holds the
+// coefficients in the order {rv, -gu, -gv, bu}. `uv` carries interleaved U/V
+// samples (even lanes U, odd lanes V, as implied by the coefficient vectors
+// below); `yy` carries the Y samples — presumably each duplicated to line up
+// with the U/V pairs, as textureLinearRowR8 produces.
+template <const double MATRIX[4]>
+struct YUVConverterImpl {
+ static inline PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) {
+ // Convert matrix coefficients to fixed-point representation.
+ // The G coefficients are negative, so they are negated before rounding
+ // (to round half away from zero) and negated back afterwards.
+ constexpr int16_t mrv = int16_t(MATRIX[0] * 64.0 + 0.5);
+ constexpr int16_t mgu = -int16_t(MATRIX[1] * -64.0 + 0.5);
+ constexpr int16_t mgv = -int16_t(MATRIX[2] * -64.0 + 0.5);
+ constexpr int16_t mbu = int16_t(MATRIX[3] * 64.0 + 0.5);
+
+ // Bias Y values by -16 and multiply by 74.5. Add 2^5 offset to round to
+ // nearest 2^6.
+ // (yy * 74 + (yy >> 1) is the integer form of yy * 74.5.)
+ yy = yy * 74 + (yy >> 1) + (int16_t(-16 * 74.5) + (1 << 5));
+
+ // Bias U/V values by -128.
+ uv -= 128;
+
+ // Compute (R, B) = (74.5*Y + rv*V, 74.5*Y + bu*U)
+ auto br = V8<int16_t>{mbu, mrv, mbu, mrv, mbu, mrv, mbu, mrv} * uv;
+ br = addsat(yy, br);
+ // Drop the 6 fractional bits.
+ br >>= 6;
+
+ // Compute G = 74.5*Y + -gu*U + -gv*V
+ auto gg = V8<int16_t>{mgu, mgv, mgu, mgv, mgu, mgv, mgu, mgv} * uv;
+ // Shifting each 32-bit lane pair right by 16 moves the V product of a pair
+ // into the position of its U product so the two can be summed
+ // (NOTE(review): relies on little-endian lane order — confirm).
+ gg = addsat(
+ yy,
+ addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16)));
+ gg >>= 6;
+
+ // Interleave B/R and G values. Force alpha to opaque.
+ return packYUV(gg, br);
+ }
+};
+
+// Primary template, intentionally empty: each supported YUVColorSpace gets an
+// explicit specialization that provides a static convert() function.
+template <YUVColorSpace COLOR_SPACE>
+struct YUVConverter {};
+
+// clang-format off
+// From Rec601:
+// [R] [1.1643835616438356, 0.0, 1.5960267857142858 ] [Y - 16]
+// [G] = [1.1643835616438358, -0.3917622900949137, -0.8129676472377708 ] x [U - 128]
+// [B] [1.1643835616438356, 2.017232142857143, 8.862867620416422e-17] [V - 128]
+// clang-format on
+// Rec601 coefficients in the {rv, -gu, -gv, bu} order consumed by
+// YUVConverterImpl. Declared `static constexpr` like the Rec709 and Rec2020
+// tables so all three matrices are declared the same way.
+static constexpr double YUVMatrix601[4] = {
+    1.5960267857142858, -0.3917622900949137, -0.8129676472377708,
+    2.017232142857143};
+template <>
+struct YUVConverter<REC_601> : YUVConverterImpl<YUVMatrix601> {};
+
+// clang-format off
+// From Rec709:
+// [R] [1.1643835616438356, 0.0, 1.7927410714285714] [Y - 16]
+// [G] = [1.1643835616438358, -0.21324861427372963, -0.532909328559444 ] x [U - 128]
+// [B] [1.1643835616438356, 2.1124017857142854, 0.0 ] [V - 128]
+// clang-format on
+// Rec709 coefficients in the {rv, -gu, -gv, bu} order consumed by
+// YUVConverterImpl.
+static constexpr double YUVMatrix709[4] = {
+ 1.7927410714285714, -0.21324861427372963, -0.532909328559444,
+ 2.1124017857142854};
+template <>
+struct YUVConverter<REC_709> : YUVConverterImpl<YUVMatrix709> {};
+
+// clang-format off
+// From Rec2020:
+// [R] [1.16438356164384, 0.0, 1.678674107142860 ] [Y - 16]
+// [G] = [1.16438356164384, -0.187326104219343, -0.650424318505057 ] x [U - 128]
+// [B] [1.16438356164384, 2.14177232142857, 0.0 ] [V - 128]
+// clang-format on
+// Rec2020 coefficients in the {rv, -gu, -gv, bu} order consumed by
+// YUVConverterImpl.
+static constexpr double YUVMatrix2020[4] = {
+ 1.678674107142860, -0.187326104219343, -0.650424318505057,
+ 2.14177232142857};
+template <>
+struct YUVConverter<REC_2020> : YUVConverterImpl<YUVMatrix2020> {};
+
+// clang-format off
+// [R] [V]
+// [G] = [Y]
+// [B] [U]
+// clang-format on
+// Identity conversion: no color matrix is applied; the input planes are only
+// interleaved and packed.
+template <>
+struct YUVConverter<IDENTITY> {
+ static inline PackedRGBA8 convert(V8<int16_t> y, V8<int16_t> uv) {
+ // Map U/V directly to B/R and map Y directly to G with opaque alpha.
+ return packYUV(y, uv);
+ }
+};
+
+// Helper function for textureLinearRowR8 that samples horizontal taps and
+// combines them based on Y fraction with next row.
+//
+// For each of the four X offsets in `ix`, loads two adjacent bytes from the
+// row at byte offset `offsety` and from the row `stridey` bytes after it,
+// then blends the two rows with weight fracy/128. The result holds the
+// (left, right) tap pair of each of the four pixels in consecutive lanes.
+template <typename S>
+static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix,
+ int32_t offsety,
+ int32_t stridey,
+ int16_t fracy) {
+ uint8_t* buf = (uint8_t*)sampler->buf + offsety;
+ // First row: two horizontally adjacent texels per pixel.
+ auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
+ auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
+ auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
+ auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
+ auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>);
+ // Second row, at the same X offsets.
+ buf += stridey;
+ auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
+ auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
+ auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
+ auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
+ auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>);
+ // Lerp between the rows; fracy is a 7-bit fraction, hence the >> 7.
+ abcd0 += ((abcd1 - abcd0) * fracy) >> 7;
+ return abcd0;
+}
+
+// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes
+// constant Y and returns a duplicate of the result interleaved with itself
+// to aid in later YUV transformation.
+//
+// `ix` holds four X coordinates with 7 fractional bits; `offsety`, `stridey`
+// and `fracy` are forwarded to linearRowTapsR8 to select and blend the rows.
+template <typename S>
+static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix, int32_t offsety,
+ int32_t stridey, int16_t fracy) {
+ assert(sampler->format == TextureFormat::R8);
+
+ // Calculate X fraction and clamp X offset into range.
+ I32 fracx = ix;
+ ix >>= 7;
+ // NOTE(review): assuming vector comparisons yield all-ones lanes when true,
+ // this zeroes the fraction for ix < 0 and forces it to 0x7F for
+ // ix > width - 2, so the blend stays on valid texels after clamping.
+ fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
+ ix = clampCoord(ix, sampler->width - 1);
+
+ // Load the sample taps and combine rows.
+ auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);
+
+ // Unzip the result and do final horizontal multiply-add based on X fraction.
+ // Even lanes are the left taps and odd lanes the right taps; each is
+ // duplicated so the output carries every pixel twice.
+ auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6);
+ auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7);
+ abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7;
+
+ // The final result is the packed values interleaved with a duplicate of
+ // themselves.
+ return abcdl;
+}
+
+// Optimized version of textureLinearPackedR8 for paired U/V R8 textures.
+// Since the two textures have the same dimensions and stride, the addressing
+// math can be shared between both samplers. This also allows a coalesced
+// multiply in the final stage by packing both U/V results into a single
+// operation.
+//
+// `ix` holds four X coordinates with 7 fractional bits; `offsety`, `stridey`
+// and `fracy` are forwarded to linearRowTapsR8 for both samplers.
+template <typename S>
+static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2,
+ I32 ix, int32_t offsety,
+ int32_t stridey,
+ int16_t fracy) {
+ // Both planes must be R8 with identical dimensions and stride, since one
+ // set of offsets is used to address both.
+ assert(sampler->format == TextureFormat::R8 &&
+ sampler2->format == TextureFormat::R8);
+ assert(sampler->width == sampler2->width &&
+ sampler->height == sampler2->height);
+ assert(sampler->stride == sampler2->stride);
+
+ // Calculate X fraction and clamp X offset into range.
+ I32 fracx = ix;
+ ix >>= 7;
+ // NOTE(review): assuming vector comparisons yield all-ones lanes when true,
+ // this zeroes the fraction for ix < 0 and forces it to 0x7F for
+ // ix > width - 2, so the blend stays on valid texels after clamping.
+ fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
+ ix = clampCoord(ix, sampler->width - 1);
+
+ // Load the sample taps for the first sampler and combine rows.
+ auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);
+
+ // Load the sample taps for the second sampler and combine rows.
+ auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy);
+
+ // We are left with a result vector for each sampler with values for adjacent
+ // pixels interleaved together in each. We need to unzip these values so that
+ // we can do the final horizontal multiply-add based on the X fraction.
+ // Indices 0-7 select from abcd and 8-15 from xyzw, so each output pair is
+ // (first sampler, second sampler) for the same pixel.
+ auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14);
+ auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15);
+ abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7;
+
+ // The final result is the packed values for the first sampler interleaved
+ // with the packed values for the second sampler.
+ return abcdxyzwl;
+}
+
+// Converts one destination row from planar YUV to packed 32-bit pixels using
+// the COLOR_SPACE transform, sampling the planes with linear filtering.
+//
+// dest - first output pixel of the row.
+// span - number of pixels to produce; handled 4 at a time with a partial
+// store for any remainder.
+// srcUV, srcDU - start coordinate in the Y plane and its per-pixel X step.
+// chromaUV, chromaDU - the same for the U and V planes.
+// sampler - the Y, U and V samplers, in that order.
+// colorDepth - bits per sample: asserted to be 8 on the R8 path and greater
+// than 8 on the R16 path.
+template <YUVColorSpace COLOR_SPACE>
+static void linear_row_yuv(uint32_t* dest, int span, const vec2_scalar& srcUV,
+ float srcDU, const vec2_scalar& chromaUV,
+ float chromaDU, sampler2D_impl sampler[3],
+ int colorDepth) {
+ // Casting to int loses some precision while stepping that can offset the
+ // image, so shift the values by some extra bits of precision to minimize
+ // this. We support up to 16 bits of image size, 7 bits of quantization,
+ // and 1 bit for sign, which leaves 8 bits left for extra precision.
+ const int STEP_BITS = 8;
+
+ // Calculate varying and constant interp data for Y plane.
+ I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS));
+ int32_t yV = int32_t(srcUV.y);
+
+ // Calculate varying and constant interp data for chroma planes.
+ I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS));
+ int32_t cV = int32_t(chromaUV.y);
+
+ // We need to skip 4 pixels per chunk.
+ int32_t yDU = int32_t((4 << STEP_BITS) * srcDU);
+ int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU);
+
+ if (sampler[0].width < 2 || sampler[1].width < 2) {
+ // If the source row has less than 2 pixels, it's not safe to use a linear
+ // filter because it may overread the row. Just convert the single pixel
+ // with nearest filtering and fill the row with it.
+ I16 yuv =
+ CONVERT(round_pixel((Float){
+ texelFetch(&sampler[0], ivec2(srcUV), 0).x.x,
+ texelFetch(&sampler[1], ivec2(chromaUV), 0).x.x,
+ texelFetch(&sampler[2], ivec2(chromaUV), 0).x.x, 1.0f}),
+ I16);
+ // The converted color is constant for the row, so compute it once.
+ auto rgb = YUVConverter<COLOR_SPACE>::convert(zip(I16(yuv.x), I16(yuv.x)),
+ zip(I16(yuv.y), I16(yuv.z)));
+ for (; span >= 4; span -= 4) {
+ unaligned_store(dest, rgb);
+ dest += 4;
+ }
+ if (span > 0) {
+ partial_store_span(dest, rgb, span);
+ }
+ } else if (sampler[0].format == TextureFormat::R16) {
+ // Sample each YUV plane, rescale it to fit in low 8 bits of word, and then
+ // transform them by the appropriate color space.
+ assert(colorDepth > 8);
+ // Need to right shift the sample by the amount of bits over 8 it occupies.
+ // On output from textureLinearUnpackedR16, we have lost 1 bit of precision
+ // at the low end already, hence 1 is subtracted from the color depth.
+ int rescaleBits = (colorDepth - 1) - 8;
+ for (; span >= 4; span -= 4) {
+ auto yPx =
+ textureLinearUnpackedR16(&sampler[0], ivec2(yU >> STEP_BITS, yV)) >>
+ rescaleBits;
+ auto uPx =
+ textureLinearUnpackedR16(&sampler[1], ivec2(cU >> STEP_BITS, cV)) >>
+ rescaleBits;
+ auto vPx =
+ textureLinearUnpackedR16(&sampler[2], ivec2(cU >> STEP_BITS, cV)) >>
+ rescaleBits;
+ unaligned_store(dest, YUVConverter<COLOR_SPACE>::convert(zip(yPx, yPx),
+ zip(uPx, vPx)));
+ dest += 4;
+ yU += yDU;
+ cU += cDU;
+ }
+ if (span > 0) {
+ // Handle any remaining pixels...
+ // A full chunk is still sampled; only `span` pixels of it are stored.
+ auto yPx =
+ textureLinearUnpackedR16(&sampler[0], ivec2(yU >> STEP_BITS, yV)) >>
+ rescaleBits;
+ auto uPx =
+ textureLinearUnpackedR16(&sampler[1], ivec2(cU >> STEP_BITS, cV)) >>
+ rescaleBits;
+ auto vPx =
+ textureLinearUnpackedR16(&sampler[2], ivec2(cU >> STEP_BITS, cV)) >>
+ rescaleBits;
+ partial_store_span(
+ dest,
+ YUVConverter<COLOR_SPACE>::convert(zip(yPx, yPx), zip(uPx, vPx)),
+ span);
+ }
+ } else {
+ assert(sampler[0].format == TextureFormat::R8);
+ assert(colorDepth == 8);
+
+ // Calculate varying and constant interp data for Y plane.
+ // The V coordinate is constant across the row, so its 7-bit fraction, row
+ // offset and next-row stride are resolved once here. The stride collapses
+ // to 0 when there is no valid following row to blend with.
+ int16_t yFracV = yV & 0x7F;
+ yV >>= 7;
+ int32_t yOffsetV = clampCoord(yV, sampler[0].height) * sampler[0].stride;
+ int32_t yStrideV =
+ yV >= 0 && yV < int32_t(sampler[0].height) - 1 ? sampler[0].stride : 0;
+
+ // Calculate varying and constant interp data for chroma planes.
+ // Only sampler[1] is consulted; textureLinearRowPairedR8 asserts that
+ // sampler[2] has matching dimensions and stride.
+ int16_t cFracV = cV & 0x7F;
+ cV >>= 7;
+ int32_t cOffsetV = clampCoord(cV, sampler[1].height) * sampler[1].stride;
+ int32_t cStrideV =
+ cV >= 0 && cV < int32_t(sampler[1].height) - 1 ? sampler[1].stride : 0;
+
+ for (; span >= 4; span -= 4) {
+ // Sample each YUV plane and then transform them by the appropriate color
+ // space.
+ auto yPx = textureLinearRowR8(&sampler[0], yU >> STEP_BITS, yOffsetV,
+ yStrideV, yFracV);
+ auto uvPx =
+ textureLinearRowPairedR8(&sampler[1], &sampler[2], cU >> STEP_BITS,
+ cOffsetV, cStrideV, cFracV);
+ unaligned_store(dest, YUVConverter<COLOR_SPACE>::convert(yPx, uvPx));
+ dest += 4;
+ yU += yDU;
+ cU += cDU;
+ }
+ if (span > 0) {
+ // Handle any remaining pixels...
+ auto yPx = textureLinearRowR8(&sampler[0], yU >> STEP_BITS, yOffsetV,
+ yStrideV, yFracV);
+ auto uvPx =
+ textureLinearRowPairedR8(&sampler[1], &sampler[2], cU >> STEP_BITS,
+ cOffsetV, cStrideV, cFracV);
+ partial_store_span(dest, YUVConverter<COLOR_SPACE>::convert(yPx, uvPx),
+ span);
+ }
+ }
+}
+
+// Converts a region of three YUV plane textures into dsttex, one row at a
+// time, using linear_row_yuv instantiated for the requested color space.
+//
+// ytex, utex, vtex - source planes; utex supplies the chroma plane dimensions.
+// colorSpace - selects the linear_row_yuv instantiation; unknown values are
+// logged and asserted on.
+// colorDepth - passed through to linear_row_yuv.
+// srcReq - requested source rect, in Y plane coordinates.
+// dsttex, dstReq - destination texture and requested destination rect.
+// invertY - when set, source rows are walked in reverse order.
+// clipRect - clip applied to the destination sampling bounds.
+static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex,
+ YUVColorSpace colorSpace, int colorDepth,
+ const IntRect& srcReq, Texture& dsttex,
+ const IntRect& dstReq, bool invertY,
+ const IntRect& clipRect) {
+ // Compute valid dest bounds
+ IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
+ dstBounds.intersect(clipRect);
+ // Check if sampling bounds are empty
+ if (dstBounds.is_empty()) {
+ return;
+ }
+ // Initialize samplers for source textures
+ sampler2D_impl sampler[3];
+ init_sampler(&sampler[0], ytex);
+ init_sampler(&sampler[1], utex);
+ init_sampler(&sampler[2], vtex);
+
+ // Compute source UVs
+ // srcDUV is the source step per destination pixel in each axis.
+ vec2_scalar srcUV(srcReq.x0, srcReq.y0);
+ vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
+ float(srcReq.height()) / dstReq.height());
+ // Inverted Y must step downward along source rows
+ if (invertY) {
+ srcUV.y += srcReq.height();
+ srcDUV.y = -srcDUV.y;
+ }
+ // Skip to clamped source start
+ // The 0.5 offset samples at the center of the first destination pixel.
+ srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
+ // Calculate separate chroma UVs for chroma planes with different scale
+ vec2_scalar chromaScale(float(utex.width) / ytex.width,
+ float(utex.height) / ytex.height);
+ vec2_scalar chromaUV = srcUV * chromaScale;
+ vec2_scalar chromaDUV = srcDUV * chromaScale;
+ // Scale UVs by lerp precision. If the row has only 1 pixel, then don't
+ // quantize so that we can use nearest filtering instead to avoid overreads.
+ // This condition mirrors the width check at the top of linear_row_yuv.
+ if (ytex.width >= 2 && utex.width >= 2) {
+ srcUV = linearQuantize(srcUV, 128);
+ srcDUV *= 128.0f;
+ chromaUV = linearQuantize(chromaUV, 128);
+ chromaDUV *= 128.0f;
+ }
+ // Calculate dest pointer from clamped offsets
+ int destStride = dsttex.stride();
+ char* dest = dsttex.sample_ptr(dstReq, dstBounds, 0);
+ int span = dstBounds.width();
+ for (int rows = dstBounds.height(); rows > 0; rows--) {
+ // The color space is a template parameter of linear_row_yuv, so dispatch
+ // on the runtime value here.
+ switch (colorSpace) {
+ case REC_601:
+ linear_row_yuv<REC_601>((uint32_t*)dest, span, srcUV, srcDUV.x,
+ chromaUV, chromaDUV.x, sampler, colorDepth);
+ break;
+ case REC_709:
+ linear_row_yuv<REC_709>((uint32_t*)dest, span, srcUV, srcDUV.x,
+ chromaUV, chromaDUV.x, sampler, colorDepth);
+ break;
+ case REC_2020:
+ linear_row_yuv<REC_2020>((uint32_t*)dest, span, srcUV, srcDUV.x,
+ chromaUV, chromaDUV.x, sampler, colorDepth);
+ break;
+ case IDENTITY:
+ linear_row_yuv<IDENTITY>((uint32_t*)dest, span, srcUV, srcDUV.x,
+ chromaUV, chromaDUV.x, sampler, colorDepth);
+ break;
+ default:
+ debugf("unknown YUV color space %d\n", colorSpace);
+ assert(false);
+ break;
+ }
+ // Advance to the next destination row and the matching source rows.
+ dest += destStride;
+ srcUV.y += srcDUV.y;
+ chromaUV.y += chromaDUV.y;
+ }
+}
+
+extern "C" {
+
+// Extension for compositing a YUV surface represented by separate YUV planes
+// to a BGRA destination. The supplied color space is used to determine the
+// transform from YUV to BGRA after sampling.
+//
+// lockedDst, lockedY, lockedU, lockedV - destination and plane textures; the
+// call is a no-op if any of them is null.
+// colorSpace, colorDepth - forwarded to linear_convert_yuv.
+// srcX, srcY, srcWidth, srcHeight - source rect, offset by the Y texture's
+// offset before use.
+// dstX, dstY, dstWidth, dstHeight - destination rect, offset by the
+// destination texture's offset before use.
+// flip - passed as the invertY argument of linear_convert_yuv.
+// clipX, clipY, clipWidth, clipHeight - clip rect, re-expressed relative to
+// the destination rect origin.
+void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY,
+ LockedTexture* lockedU, LockedTexture* lockedV,
+ YUVColorSpace colorSpace, GLuint colorDepth, GLint srcX,
+ GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
+ GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
+ GLboolean flip, GLint clipX, GLint clipY, GLsizei clipWidth,
+ GLsizei clipHeight) {
+ if (!lockedDst || !lockedY || !lockedU || !lockedV) {
+ return;
+ }
+ Texture& ytex = *lockedY;
+ Texture& utex = *lockedU;
+ Texture& vtex = *lockedV;
+ Texture& dsttex = *lockedDst;
+ // All YUV planes must currently be represented by R8 or R16 textures.
+ // The chroma (U/V) planes must have matching dimensions.
+ // These requirements are enforced by assert only, so they are not checked
+ // when asserts are compiled out.
+ assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp());
+ assert((ytex.bpp() == 1 && colorDepth == 8) ||
+ (ytex.bpp() == 2 && colorDepth > 8));
+ // The Y plane is not required to match the chroma plane size;
+ // linear_convert_yuv scales chroma coordinates by the size ratio instead.
+ // assert(ytex.width == utex.width && ytex.height == utex.height);
+ assert(utex.width == vtex.width && utex.height == vtex.height);
+ assert(ytex.offset == utex.offset && ytex.offset == vtex.offset);
+ assert(dsttex.bpp() == 4);
+
+ IntRect srcReq =
+ IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset;
+ IntRect dstReq =
+ IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
+ // Compute clip rect as relative to the dstReq, as that's the same coords
+ // as used for the sampling bounds.
+ IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
+ clipY - dstY + clipHeight};
+ // For now, always use a linear filter path that would be required for
+ // scaling. Further fast-paths for non-scaled video might be desirable in the
+ // future.
+ linear_convert_yuv(ytex, utex, vtex, colorSpace, colorDepth, srcReq, dsttex,
+ dstReq, flip, clipRect);
+}
+
+} // extern "C"
diff --git a/gfx/wr/swgl/src/gl.cc b/gfx/wr/swgl/src/gl.cc
new file mode 100644
index 0000000000..370d243b9a
--- /dev/null
+++ b/gfx/wr/swgl/src/gl.cc
@@ -0,0 +1,4445 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef __MACH__
+# include <mach/mach.h>
+# include <mach/mach_time.h>
+#else
+# include <time.h>
+#endif
+
+// debugf is a printf-style diagnostic. It forwards its arguments to printf,
+// and expands to nothing when NDEBUG is defined.
+#ifdef NDEBUG
+# define debugf(...)
+#else
+# define debugf(...) printf(__VA_ARGS__)
+#endif
+
+// #define PRINT_TIMINGS
+
+// Inlining control macros, plus the minimal Windows declarations needed to
+// call the performance counter API without including Windows.h.
+#ifdef _WIN32
+# define ALWAYS_INLINE __forceinline
+# define NO_INLINE __declspec(noinline)
+
+// Including Windows.h brings a huge amount of namespace pollution so just
+// define a couple of things manually
+typedef int BOOL;
+# define WINAPI __stdcall
+# define DECLSPEC_IMPORT __declspec(dllimport)
+# define WINBASEAPI DECLSPEC_IMPORT
+typedef unsigned long DWORD;
+typedef long LONG;
+typedef __int64 LONGLONG;
+# define DUMMYSTRUCTNAME
+
+// Local declaration of LARGE_INTEGER: a 64-bit value (QuadPart) that can also
+// be viewed as separate low and high parts.
+typedef union _LARGE_INTEGER {
+ struct {
+ DWORD LowPart;
+ LONG HighPart;
+ } DUMMYSTRUCTNAME;
+ struct {
+ DWORD LowPart;
+ LONG HighPart;
+ } u;
+ LONGLONG QuadPart;
+} LARGE_INTEGER;
+extern "C" {
+WINBASEAPI BOOL WINAPI
+QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount);
+
+WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency);
+}
+
+#else
+# define ALWAYS_INLINE __attribute__((always_inline)) inline
+# define NO_INLINE __attribute__((noinline))
+#endif
+
+// Marks a point the code never reaches, via the compiler builtin.
+#define UNREACHABLE __builtin_unreachable()
+
+// Attribute for declarations that may intentionally go unused.
+#define UNUSED __attribute__((unused))
+
+// Attribute marking an intentional fall-through between switch cases.
+#define FALLTHROUGH __attribute__((fallthrough))
+
+// IMPLICIT attaches the "moz_implicit" annotation when MOZILLA_CLIENT is
+// defined, and expands to nothing otherwise.
+#ifdef MOZILLA_CLIENT
+# define IMPLICIT __attribute__((annotate("moz_implicit")))
+#else
+# define IMPLICIT
+#endif
+
+#include "gl_defs.h"
+#include "glsl.h"
+#include "program.h"
+#include "texture.h"
+
+using namespace glsl;
+
+typedef ivec2_scalar IntPoint;
+
+// Axis-aligned integer rectangle described by two corners. width() and
+// height() are plain differences of the corners, so (x1, y1) behaves as an
+// exclusive end and any rect with x1 <= x0 or y1 <= y0 is empty.
+struct IntRect {
+  int x0;
+  int y0;
+  int x1;
+  int y1;
+
+  // An empty rect at the origin.
+  IntRect() : x0(0), y0(0), x1(0), y1(0) {}
+  // A rect from explicit corner coordinates.
+  IntRect(int x0, int y0, int x1, int y1) : x0(x0), y0(y0), x1(x1), y1(y1) {}
+  // A rect from an origin corner and a size.
+  IntRect(IntPoint origin, IntPoint size)
+      : x0(origin.x),
+        y0(origin.y),
+        x1(origin.x + size.x),
+        y1(origin.y + size.y) {}
+
+  int width() const { return x1 - x0; }
+  int height() const { return y1 - y0; }
+  // True when either dimension is zero or negative.
+  bool is_empty() const { return width() <= 0 || height() <= 0; }
+
+  IntPoint origin() const { return IntPoint(x0, y0); }
+
+  // True when both rects have identical width and height.
+  bool same_size(const IntRect& o) const {
+    return width() == o.width() && height() == o.height();
+  }
+
+  // True when o lies entirely within this rect (edges may coincide).
+  bool contains(const IntRect& o) const {
+    return o.x0 >= x0 && o.y0 >= y0 && o.x1 <= x1 && o.y1 <= y1;
+  }
+
+  // Clip this rect in place to its overlap with o. If the rects do not
+  // overlap, the result has a non-positive width or height and is_empty().
+  IntRect& intersect(const IntRect& o) {
+    x0 = max(x0, o.x0);
+    y0 = max(y0, o.y0);
+    x1 = min(x1, o.x1);
+    y1 = min(y1, o.y1);
+    return *this;
+  }
+
+  // Non-mutating counterpart of intersect(): returns the overlap with o and
+  // leaves this rect untouched. Declared const so that it can be used on const
+  // rects and temporaries bound to const references.
+  IntRect intersection(const IntRect& o) const {
+    IntRect result = *this;
+    result.intersect(o);
+    return result;
+  }
+
+  // Scale from source-space to dest-space, optionally rounding inward
+  // When roundIn is set, the (x0, y0) corner is rounded up; the (x1, y1)
+  // corner always uses plain integer division.
+  IntRect& scale(int srcWidth, int srcHeight, int dstWidth, int dstHeight,
+                 bool roundIn = false) {
+    x0 = (x0 * dstWidth + (roundIn ? srcWidth - 1 : 0)) / srcWidth;
+    y0 = (y0 * dstHeight + (roundIn ? srcHeight - 1 : 0)) / srcHeight;
+    x1 = (x1 * dstWidth) / srcWidth;
+    y1 = (y1 * dstHeight) / srcHeight;
+    return *this;
+  }
+
+  // Flip the rect's Y coords around inflection point at Y=offset
+  // The two edges are swapped afterwards so that y0 <= y1 is preserved.
+  void invert_y(int offset) {
+    y0 = offset - y0;
+    y1 = offset - y1;
+    swap(y0, y1);
+  }
+
+  // Translate this rect in place by o.
+  IntRect& offset(const IntPoint& o) {
+    x0 += o.x;
+    y0 += o.y;
+    x1 += o.x;
+    y1 += o.y;
+    return *this;
+  }
+
+  // Translated copies of this rect.
+  IntRect operator+(const IntPoint& o) const {
+    return IntRect(*this).offset(o);
+  }
+  IntRect operator-(const IntPoint& o) const {
+    return IntRect(*this).offset(-o);
+  }
+};
+
+// Per-attribute vertex fetch state. Every field has a zero/false/null
+// default, so a default-constructed attribute is disabled and has no buffer.
+struct VertexAttrib {
+ size_t size = 0; // in bytes
+ GLenum type = 0;
+ bool normalized = false;
+ GLsizei stride = 0;
+ GLuint offset = 0;
+ bool enabled = false;
+ GLuint divisor = 0;
+ int vertex_array = 0;
+ int vertex_buffer = 0;
+ char* buf = nullptr; // XXX: this can easily dangle
+ size_t buf_size = 0; // this will let us bounds check
+};
+
+// Returns the number of bytes one texel occupies for the given GL internal
+// format. Unrecognized formats are logged, trip an assert, and yield 0.
+static int bytes_for_internal_format(GLenum internal_format) {
+  switch (internal_format) {
+    // Four 32-bit channels.
+    case GL_RGBA32F:
+    case GL_RGBA32I:
+      return 16;
+    // Four 8-bit channels, and every depth format.
+    case GL_RGBA8:
+    case GL_BGRA8:
+    case GL_RGBA:
+    case GL_DEPTH_COMPONENT:
+    case GL_DEPTH_COMPONENT16:
+    case GL_DEPTH_COMPONENT24:
+    case GL_DEPTH_COMPONENT32:
+      return 4;
+    // Two-byte formats.
+    case GL_RG8:
+    case GL_RG:
+    case GL_RGB_RAW_422_APPLE:
+    case GL_R16:
+      return 2;
+    // Single-byte formats.
+    case GL_R8:
+    case GL_RED:
+      return 1;
+    default:
+      break;
+  }
+  debugf("internal format: %x\n", internal_format);
+  assert(0);
+  return 0;
+}
+
+// Rounds a row size in bytes up to the next multiple of 4.
+static inline int aligned_stride(int row_bytes) {
+  const int align_mask = 3;
+  return (row_bytes + align_mask) & ~align_mask;
+}
+
+// Translates a GL internal format enum into the matching TextureFormat.
+// Formats without a mapping trip an assert and fall back to RGBA8.
+static TextureFormat gl_format_to_texture_format(int type) {
+  TextureFormat format = TextureFormat::RGBA8;
+  switch (type) {
+    case GL_RGBA32F:
+      format = TextureFormat::RGBA32F;
+      break;
+    case GL_RGBA32I:
+      format = TextureFormat::RGBA32I;
+      break;
+    case GL_RGBA8:
+      format = TextureFormat::RGBA8;
+      break;
+    case GL_R8:
+      format = TextureFormat::R8;
+      break;
+    case GL_RG8:
+      format = TextureFormat::RG8;
+      break;
+    case GL_R16:
+      format = TextureFormat::R16;
+      break;
+    case GL_RGB_RAW_422_APPLE:
+      format = TextureFormat::YUV422;
+      break;
+    default:
+      // Unknown format: keep the RGBA8 fallback after asserting.
+      assert(0);
+      break;
+  }
+  return format;
+}
+
+// State for a GL query object: a single 64-bit result value, initially 0.
+struct Query {
+ uint64_t value = 0;
+};
+
+// A growable byte buffer backed by malloc-family storage. `size` is the
+// logical length and `capacity` is how many bytes are actually allocated.
+struct Buffer {
+  char* buf = nullptr;
+  size_t size = 0;
+  size_t capacity = 0;
+
+  // Resize the buffer to new_size bytes. Returns true if the logical size
+  // changed, or false if the size was already new_size or the reallocation
+  // failed (in which case the buffer is released entirely).
+  bool allocate(size_t new_size) {
+    // Nothing to do when the size is unchanged.
+    if (new_size == size) {
+      return false;
+    }
+    // Only touch the allocator when the request outgrows the storage already
+    // held; shrinking or regrowing within capacity reuses it.
+    if (new_size > capacity) {
+      char* grown = (char*)realloc(buf, new_size);
+      assert(grown);
+      if (grown == nullptr) {
+        // Drop the old allocation state rather than leave it lying around.
+        cleanup();
+        return false;
+      }
+      buf = grown;
+      capacity = new_size;
+    }
+    size = new_size;
+    return true;
+  }
+
+  // Release the storage, if any, and reset the bookkeeping.
+  void cleanup() {
+    if (!buf) {
+      return;
+    }
+    free(buf);
+    buf = nullptr;
+    size = 0;
+    capacity = 0;
+  }
+
+  ~Buffer() { cleanup(); }
+};
+
+// State for a GL framebuffer object: the color and depth attachments, each
+// held as a GLuint id, plus a layer index. All default to 0.
+struct Framebuffer {
+ GLuint color_attachment = 0;
+ GLint layer = 0;
+ GLuint depth_attachment = 0;
+};
+
+// State for a GL renderbuffer object, which refers to a texture by id.
+struct Renderbuffer {
+ GLuint texture = 0;
+
+ // Declared here; the definition lies outside this part of the file.
+ void on_erase();
+};
+
+// Reduces a GL texture filter enum to NEAREST or LINEAR. The mipmap variants
+// map to the filter named by their first component. Unrecognized values trip
+// an assert and fall back to NEAREST.
+TextureFilter gl_filter_to_texture_filter(int type) {
+  switch (type) {
+    case GL_NEAREST:
+    case GL_NEAREST_MIPMAP_LINEAR:
+    case GL_NEAREST_MIPMAP_NEAREST:
+      return TextureFilter::NEAREST;
+    case GL_LINEAR:
+    case GL_LINEAR_MIPMAP_LINEAR:
+    case GL_LINEAR_MIPMAP_NEAREST:
+      return TextureFilter::LINEAR;
+    default:
+      break;
+  }
+  assert(0);
+  return TextureFilter::NEAREST;
+}
+
+// The SWGL depth buffer is roughly organized as a span buffer where each row
+// of the depth buffer is a list of spans, and each span has a constant depth
+// and a run length (represented by DepthRun). The span from start..start+count
+// is placed directly at that start index in the row's array of runs, so that
+// there is no need to explicitly record the start index at all. This also
+// avoids the need to move items around in the run array to manage insertions
+// since space is implicitly always available for a run between any two
+// pre-existing runs. Linkage from one run to the next is implicitly defined by
+// the count, so if a run exists from start..start+count, the next run will
+// implicitly pick up right at index start+count where that preceding run left
+// off. All of the DepthRun items that are after the head of the run can remain
+// uninitialized until the run needs to be split and a new run needs to start
+// somewhere in between.
+// For uses like perspective-correct rasterization or with a discard mask, a
+// run is not an efficient representation, and it is more beneficial to have
+// a flattened array of individual depth samples that can be masked off easily.
+// To support this case, the first run in a given row's run array may have a
+// zero count, signaling that this entire row is flattened. Critically, the
+// depth and count fields in DepthRun are ordered (endian-dependently) so that
+// the DepthRun struct can be interpreted as a sign-extended int32_t depth. It
+// is then possible to just treat the entire row as an array of int32_t depth
+// samples that can be processed with SIMD comparisons, since the count field
+// behaves as just the sign-extension of the depth field.
+// When a depth buffer is cleared, each row is initialized to a single run
+// spanning the entire row. In the normal case, the depth buffer will continue
+// to manage itself as a list of runs. If perspective or discard is used for
+// a given row, the row will be converted to the flattened representation to
+// support it, after which it will only ever revert back to runs if the depth
+// buffer is cleared.
+struct DepthRun {
+ // Ensure that depth always occupies the LSB and count the MSB so that we
+ // can sign-extend depth just by setting count to zero, marking it flat.
+ // When count is non-zero, then this is interpreted as an actual run and
+ // depth is read in isolation.
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ uint16_t depth;
+ uint16_t count;
+#else
+ uint16_t count;
+ uint16_t depth;
+#endif
+
+ // The defaulted constructor leaves both fields uninitialized, matching the
+ // note above that entries after the head of a run may remain uninitialized.
+ DepthRun() = default;
+ DepthRun(uint16_t depth, uint16_t count) : depth(depth), count(count) {}
+
+ // If count is zero, this is actually a flat depth sample rather than a run.
+ bool is_flat() const { return !count; }
+
+ // Compare a source depth from rasterization with a stored depth value.
+ // FUNC is a compile-time GL comparison enum. Only GL_LEQUAL, GL_LESS and
+ // GL_ALWAYS are handled; any other value asserts and reports failure.
+ template <int FUNC>
+ ALWAYS_INLINE bool compare(uint16_t src) const {
+ switch (FUNC) {
+ case GL_LEQUAL:
+ return src <= depth;
+ case GL_LESS:
+ return src < depth;
+ case GL_ALWAYS:
+ return true;
+ default:
+ assert(false);
+ return false;
+ }
+ }
+};
+
+// A cursor for reading and modifying a row's depth run array. It locates
+// and iterates through a desired span within all the runs, testing if
+// the depth of this span passes or fails the depth test against existing
+// runs. If desired, new runs may be inserted to represent depth occlusion
+// from this span in the run array.
+struct DepthCursor {
+ // Current position of run the cursor has advanced to.
+ DepthRun* cur = nullptr;
+ // The start of the remaining potential samples in the desired span.
+ DepthRun* start = nullptr;
+ // The end of the potential samples in the desired span.
+ DepthRun* end = nullptr;
+
+ DepthCursor() = default;
+
+ // Construct a cursor with runs for a given row's run array and the bounds
+ // of the span we wish to iterate within it.
+ // runs - the row's run array; its first entry must not be flat.
+ // num_runs - number of entries in the row.
+ // span_offset, span_count - start index and length of the span of interest.
+ DepthCursor(DepthRun* runs, int num_runs, int span_offset, int span_count)
+ : cur(runs), start(&runs[span_offset]), end(start + span_count) {
+ // This cursor should never iterate over flat runs
+ assert(!runs->is_flat());
+ DepthRun* end_runs = &runs[num_runs];
+ // Clamp end of span to end of row
+ if (end > end_runs) {
+ end = end_runs;
+ }
+ // If the span starts past the end of the row, just advance immediately
+ // to it to signal that we're done.
+ if (start >= end_runs) {
+ cur = end_runs;
+ start = end_runs;
+ return;
+ }
+ // Otherwise, find the first depth run that contains the start of the span.
+ // If the span starts after the given run, then we need to keep searching
+ // through the row to find an appropriate run. The check above already
+ // guaranteed that the span starts within the row's runs, and the search
+ // won't fall off the end.
+ // NOTE(review): the assert below also requires the span to be non-empty; a
+ // span_count of 0 makes end equal start — confirm callers never pass 0.
+ for (;;) {
+ assert(cur < end);
+ DepthRun* next = cur + cur->count;
+ if (start < next) {
+ break;
+ }
+ cur = next;
+ }
+ }
+
+ // The cursor is valid if the current position is at the end or if the run
+ // contains the start position.
+ bool valid() const {
+ return cur >= end || (cur <= start && start < cur + cur->count);
+ }
+
+ // Skip past any initial runs that fail the depth test. If we find a run that
+ // would pass, then return the accumulated length between where we started
+ // and that position. Otherwise, if we fall off the end, return -1 to signal
+ // that there are no more passed runs at the end of this failed region and
+ // so it is safe for the caller to stop processing any more regions in this
+ // row.
+ template <int FUNC>
+ int skip_failed(uint16_t val) {
+ assert(valid());
+ DepthRun* prev = start;
+ while (cur < end) {
+ if (cur->compare<FUNC>(val)) {
+ return start - prev;
+ }
+ // This run failed, so move both the cursor and the span start past it.
+ cur += cur->count;
+ start = cur;
+ }
+ return -1;
+ }
+
+ // Helper to convert function parameters into template parameters to hoist
+ // some checks out of inner loops.
+ // Only GL_LEQUAL and GL_LESS are dispatched; other values assert and
+ // return -1.
+ ALWAYS_INLINE int skip_failed(uint16_t val, GLenum func) {
+ switch (func) {
+ case GL_LEQUAL:
+ return skip_failed<GL_LEQUAL>(val);
+ case GL_LESS:
+ return skip_failed<GL_LESS>(val);
+ default:
+ assert(false);
+ return -1;
+ }
+ }
+
+ // Find a region of runs that passes the depth test. It is assumed the caller
+ // has called skip_failed first to skip past any runs that failed the depth
+ // test. This stops when it finds a run that fails the depth test or we fall
+ // off the end of the row. If the write mask is enabled, this will insert runs
+ // to represent this new region that passed the depth test. The length of the
+ // region is returned.
+ template <int FUNC, bool MASK>
+ int check_passed(uint16_t val) {
+ assert(valid());
+ DepthRun* prev = cur;
+ while (cur < end) {
+ if (!cur->compare<FUNC>(val)) {
+ break;
+ }
+ DepthRun* next = cur + cur->count;
+ if (next > end) {
+ if (MASK) {
+ // Chop the current run where the end of the span falls, making a new
+ // run from the end of the span till the next run. The beginning of
+ // the current run will be folded into the run from the start of the
+ // passed region before returning below.
+ *end = DepthRun(cur->depth, next - end);
+ }
+ // If the next run starts past the end, then just advance the current
+ // run to the end to signal that we're now at the end of the row.
+ next = end;
+ }
+ cur = next;
+ }
+ // If we haven't advanced past the start of the span region, then we found
+ // nothing that passed.
+ if (cur <= start) {
+ return 0;
+ }
+ // If 'end' fell within the middle of a passing run, then 'cur' will end up
+ // pointing at the new partial run created at 'end' where the passing run
+ // was split to accommodate starting in the middle. The preceding runs will
+ // be fixed below to properly join with this new split.
+ int passed = cur - start;
+ if (MASK) {
+ // If the search started from a run before the start of the span, then
+ // edit that run to meet up with the start.
+ if (prev < start) {
+ prev->count = start - prev;
+ }
+ // Create a new run for the entirety of the passed samples.
+ *start = DepthRun(val, passed);
+ }
+ // The passed region is consumed; the next query resumes from here.
+ start = cur;
+ return passed;
+ }
+
+ // Helper to convert function parameters into template parameters to hoist
+ // some checks out of inner loops.
+ // Only GL_LEQUAL and GL_LESS are dispatched; other values assert and
+ // return 0.
+ template <bool MASK>
+ ALWAYS_INLINE int check_passed(uint16_t val, GLenum func) {
+ switch (func) {
+ case GL_LEQUAL:
+ return check_passed<GL_LEQUAL, MASK>(val);
+ case GL_LESS:
+ return check_passed<GL_LESS, MASK>(val);
+ default:
+ assert(false);
+ return 0;
+ }
+ }
+
+ // Runtime-mask overload: selects the MASK template argument from `mask`.
+ ALWAYS_INLINE int check_passed(uint16_t val, GLenum func, bool mask) {
+ return mask ? check_passed<true>(val, func)
+ : check_passed<false>(val, func);
+ }
+
+ // Fill a region of runs with a given depth value, bypassing any depth test.
+ ALWAYS_INLINE void fill(uint16_t depth) {
+ check_passed<GL_ALWAYS, true>(depth);
+ }
+};
+
+struct Texture {
+ GLenum internal_format = 0;
+ int width = 0;
+ int height = 0;
+ int depth = 0;
+ char* buf = nullptr;
+ size_t buf_size = 0;
+ uint32_t buf_stride = 0;
+ uint8_t buf_bpp = 0;
+ GLenum min_filter = GL_NEAREST;
+ GLenum mag_filter = GL_LINEAR;
+ // The number of active locks on this texture. If this texture has any active
+ // locks, we need to disallow modifying or destroying the texture as it may
+ // be accessed by other threads where modifications could lead to races.
+ int32_t locked = 0;
+ // When used as an attachment of a framebuffer, rendering to the texture
+ // behaves as if it is located at the given offset such that the offset is
+ // subtracted from all transformed vertexes after the viewport is applied.
+ IntPoint offset;
+
+ enum FLAGS {
+ // If the buffer is internally-allocated by SWGL
+ SHOULD_FREE = 1 << 1,
+ // If the buffer has been cleared to initialize it. Currently this is only
+ // utilized by depth buffers which need to know when depth runs have reset
+ // to a valid row state. When unset, the depth runs may contain garbage.
+ CLEARED = 1 << 2,
+ };
+ int flags = SHOULD_FREE;
+ bool should_free() const { return bool(flags & SHOULD_FREE); }
+ bool cleared() const { return bool(flags & CLEARED); }
+
+ void set_flag(int flag, bool val) {
+ if (val) {
+ flags |= flag;
+ } else {
+ flags &= ~flag;
+ }
+ }
+ void set_should_free(bool val) {
+ // buf must be null before SHOULD_FREE can be safely toggled. Otherwise, we
+ // might accidentally mistakenly realloc an externally allocated buffer as
+ // if it were an internally allocated one.
+ assert(!buf);
+ set_flag(SHOULD_FREE, val);
+ }
+ void set_cleared(bool val) { set_flag(CLEARED, val); }
+
+ // Delayed-clearing state. When a clear of an FB is requested, we don't
+ // immediately clear each row, as the rows may be subsequently overwritten
+ // by draw calls, allowing us to skip the work of clearing the affected rows
+ // either fully or partially. Instead, we keep a bit vector of rows that need
+ // to be cleared later and save the value they need to be cleared with so
+ // that we can clear these rows individually when they are touched by draws.
+ // This currently only works for 2D textures, but not on texture arrays.
+ int delay_clear = 0;
+ uint32_t clear_val = 0;
+ uint32_t* cleared_rows = nullptr;
+
+ void init_depth_runs(uint16_t z);
+ void fill_depth_runs(uint16_t z);
+
+ void enable_delayed_clear(uint32_t val) {
+ delay_clear = height;
+ clear_val = val;
+ if (!cleared_rows) {
+ cleared_rows = new uint32_t[(height + 31) / 32];
+ }
+ memset(cleared_rows, 0, ((height + 31) / 32) * sizeof(uint32_t));
+ if (height & 31) {
+ cleared_rows[height / 32] = ~0U << (height & 31);
+ }
+ }
+
+ void disable_delayed_clear() {
+ if (cleared_rows) {
+ delete[] cleared_rows;
+ cleared_rows = nullptr;
+ delay_clear = 0;
+ }
+ }
+
+ int bpp() const { return buf_bpp; }
+ void set_bpp() { buf_bpp = bytes_for_internal_format(internal_format); }
+
+ size_t stride() const { return buf_stride; }
+ void set_stride() { buf_stride = aligned_stride(buf_bpp * width); }
+
+ // Set an external backing buffer of this texture.
+ void set_buffer(void* new_buf, size_t new_stride) {
+ assert(!should_free());
+ // Ensure that the supplied stride is at least as big as the internally
+ // calculated aligned stride.
+ set_bpp();
+ set_stride();
+ assert(new_stride >= buf_stride);
+
+ buf = (char*)new_buf;
+ buf_size = 0;
+ buf_stride = new_stride;
+ }
+
+ bool allocate(bool force = false, int min_width = 0, int min_height = 0) {
+ assert(!locked); // Locked textures shouldn't be reallocated
+ // If we get here, some GL API call that invalidates the texture was used.
+ // Mark the buffer as not-cleared to signal this.
+ set_cleared(false);
+ // Check if there is either no buffer currently or if we forced validation
+ // of the buffer size because some dimension might have changed.
+ if ((!buf || force) && should_free()) {
+ // Initialize the buffer's BPP and stride, since they may have changed.
+ set_bpp();
+ set_stride();
+ // Compute new size based on the maximum potential stride, rather than
+ // the current stride, to hopefully avoid reallocations when size would
+ // otherwise change too much...
+ size_t max_stride = max(buf_stride, aligned_stride(buf_bpp * min_width));
+ size_t size = max_stride * max(height, min_height) * max(depth, 1);
+ if ((!buf && size > 0) || size > buf_size) {
+ // Allocate with a SIMD register-sized tail of padding at the end so we
+ // can safely read or write past the end of the texture with SIMD ops.
+ // Currently only the flat Z-buffer texture needs this padding due to
+ // full-register loads and stores in check_depth and discard_depth. In
+ // case some code in the future accidentally uses a linear filter on a
+ // texture with less than 2 pixels per row, we also add this padding
+ // just to be safe. All other texture types and use-cases should be
+ // safe to omit padding.
+ size_t padding =
+ internal_format == GL_DEPTH_COMPONENT16 || max(width, min_width) < 2
+ ? sizeof(Float)
+ : 0;
+ char* new_buf = (char*)realloc(buf, size + padding);
+ assert(new_buf);
+ if (new_buf) {
+ // Successfully reallocated the buffer, so go ahead and set it.
+ buf = new_buf;
+ buf_size = size;
+ return true;
+ }
+ // Allocation failed, so ensure we don't leave stale buffer state.
+ cleanup();
+ }
+ }
+ // Nothing changed...
+ return false;
+ }
+
+ // Drops whatever backing store the texture currently references, resets the
+ // cached buffer metadata, and calls disable_delayed_clear().
+ void cleanup() {
+   assert(!locked);  // Locked textures shouldn't be destroyed
+   if (buf != nullptr) {
+     // Only memory we allocated ourselves may be handed to free(); an
+     // externally supplied buffer is merely forgotten.
+     const bool owned = should_free();
+     if (owned) {
+       free(buf);
+     }
+     // Always null out buf, even when it was externally allocated, so that a
+     // later toggle of the SHOULD_FREE state can never lead to realloc being
+     // called on memory that was not internally allocated.
+     buf = nullptr;
+     buf_stride = 0;
+     buf_bpp = 0;
+     buf_size = 0;
+   }
+   disable_delayed_clear();
+ }
+
+ // Destroying a texture releases any buffer state it still holds.
+ ~Texture() {
+   cleanup();
+ }
+
+ // Rectangle spanning the whole texture, anchored at (0, 0).
+ IntRect bounds() const {
+   IntRect full = {0, 0, width, height};
+   return full;
+ }
+ // The texture bounds translated by the texture's offset.
+ IntRect offset_bounds() const {
+   IntRect shifted = bounds();
+   return shifted + offset;
+ }
+
+ // Clip the requested region against the texture bounds and express the
+ // result relative to the request's origin. When invertY is set, the clipped
+ // rectangle is flipped vertically within the request's height.
+ IntRect sample_bounds(const IntRect& req, bool invertY = false) const {
+   IntRect clipped = bounds().intersect(req);
+   IntRect bb = clipped - req.origin();
+   if (invertY) {
+     bb.invert_y(req.height());
+   }
+   return bb;
+ }
+
+ // Address of the texel at column x, row y of layer z. Layers are stacked
+ // `height` rows apart; rows are stride() bytes apart; texels bpp() bytes.
+ char* sample_ptr(int x, int y, int z = 0) const {
+   auto row_offset = (height * z + y) * stride();
+   auto column_offset = x * bpp();
+   return buf + row_offset + column_offset;
+ }
+
+ // Address of the first texel to sample for the requested region, after it
+ // has been limited to the supplied (request-relative) sampling bounds.
+ char* sample_ptr(const IntRect& req, const IntRect& bounds, int z,
+                  bool invertY = false) const {
+   // Horizontal position: request origin shifted by the clamped bounds.
+   const int x = bounds.x0 + req.x0;
+   // Vertical position: measured from the bottom edge of the request when the
+   // Y axis is inverted, from the top edge otherwise.
+   int y;
+   if (invertY) {
+     y = req.y1 - 1 - bounds.y0;
+   } else {
+     y = req.y0 + bounds.y0;
+   }
+   return sample_ptr(x, y, z);
+ }
+};
+
+// The last vertex attribute is reserved as a null attribute in case a vertex
+// attribute is used without being set.
+#define MAX_ATTRIBS 17  // total slots, including the reserved null attribute
+#define NULL_ATTRIB 16  // index of the reserved null attribute (the last slot)
+struct VertexArray {
+ VertexAttrib attribs[MAX_ATTRIBS];  // per-attribute state, null slot included
+ int max_attrib = -1;  // NOTE(review): presumably the highest attribute index in use, -1 when none; confirm in validate()
+ // The GL spec defines element array buffer binding to be part of VAO state.
+ GLuint element_array_buffer_binding = 0;  // handed out by Context::get_binding for GL_ELEMENT_ARRAY_BUFFER
+
+ void validate();  // defined outside this struct
+};
+
+struct Shader {  // GL shader object: a stage type plus the loader resolved by ShaderSourceByName
+ GLenum type = 0;  // set by CreateShader; AttachShader accepts GL_VERTEX_SHADER or GL_FRAGMENT_SHADER
+ ProgramLoader loader = nullptr;  // result of load_shader(name); stays null for an unknown shader name
+};
+
+struct Program {  // GL program object wrapping a loaded program implementation
+ ProgramImpl* impl = nullptr;  // created by a shader's loader in AttachShader; deleted by the destructor
+ VertexShaderImpl* vert_impl = nullptr;  // obtained from impl->get_vertex_shader() in LinkProgram
+ FragmentShaderImpl* frag_impl = nullptr;  // obtained from impl->get_fragment_shader() in LinkProgram
+ bool deleted = false;  // set by DeleteProgram while the program is current; UseProgram erases it once replaced
+
+ ~Program() { delete impl; }  // only impl is deleted here; vert_impl/frag_impl are not
+};
+
+// clang-format off
+// BLEND_KEY/MASK_BLEND_KEY forward their arguments through CONCAT_KEY so the GL defines fully expand before being pasted into an enumerator name; the trailing 0, 0 pads keys that are given only two factors.
+#define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w
+#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0)
+#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0)
+#define FOR_EACH_BLEND_KEY(macro) \
+ macro(GL_ONE, GL_ZERO, 0, 0) \
+ macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \
+ macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \
+ macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \
+ macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \
+ macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \
+ macro(GL_ZERO, GL_SRC_COLOR, 0, 0) \
+ macro(GL_ONE, GL_ONE, 0, 0) \
+ macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \
+ macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \
+ macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \
+ macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0)
+
+#define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__),  // one enumerator per entry of FOR_EACH_BLEND_KEY
+#define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__),  // the MASK_BLEND_ counterpart of each entry
+enum BlendKey : uint8_t {
+ FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY)  // BLEND_* enumerators, numbered in list order starting at 0
+ FOR_EACH_BLEND_KEY(DEFINE_MASK_BLEND_KEY)  // MASK_BLEND_* enumerators follow, in the same order
+ BLEND_KEY_NONE = BLEND_KEY(GL_ONE, GL_ZERO),  // alias for the first list entry (GL_ONE, GL_ZERO)
+ MASK_BLEND_KEY_NONE = MASK_BLEND_KEY(GL_ONE, GL_ZERO)  // alias for its MASK_BLEND_ counterpart
+};
+// clang-format on
+
+const size_t MAX_TEXTURE_UNITS = 16;  // length of Context::texture_units; also the value GetIntegerv reports for the texture unit limits
+
+// Resets `binding` to 0 if it currently refers to `n`. Returns whether the
+// binding was cleared.
+template <typename T>
+static inline bool unlink(T& binding, T n) {
+  const bool was_bound = (binding == n);
+  if (was_bound) {
+    binding = 0;
+  }
+  return was_bound;
+}
+
+// Table mapping GL object names (used directly as array indices) to
+// individually heap-allocated objects of type O. Slots are created lazily and
+// the table grows on demand. Name 0 is never handed out by insert().
+template <typename O>
+struct ObjectStore {
+  // Array of `size` slots; a null slot means the name is unused.
+  O** objects = nullptr;
+  size_t size = 0;
+  // reserve object 0 as null
+  size_t first_free = 1;
+  // Fallback object returned by operator[] when the table could not be grown
+  // to hold the requested index.
+  O invalid;
+
+  ~ObjectStore() {
+    if (objects) {
+      for (size_t i = 0; i < size; i++) delete objects[i];
+      free(objects);
+    }
+  }
+
+  // Enlarge the table (by 1.5x steps, starting from 8) until index i is
+  // valid. New slots are nulled. Returns false if the reallocation failed, in
+  // which case the existing table is left untouched.
+  bool grow(size_t i) {
+    size_t new_size = size ? size : 8;
+    while (new_size <= i) new_size += new_size / 2;
+    O** new_objects = (O**)realloc(objects, new_size * sizeof(O*));
+    assert(new_objects);
+    if (!new_objects) return false;
+    while (size < new_size) new_objects[size++] = nullptr;
+    objects = new_objects;
+    return true;
+  }
+
+  // Store a copy of o at index i unless that slot is already occupied. Does
+  // nothing if the table cannot be grown to reach i.
+  void insert(size_t i, const O& o) {
+    if (i >= size && !grow(i)) return;
+    if (!objects[i]) objects[i] = new O(o);
+  }
+
+  // Find the lowest unused index at or after first_free and remember it. The
+  // result may equal `size`, meaning the table must grow to use it.
+  size_t next_free() {
+    size_t i = first_free;
+    while (i < size && objects[i]) i++;
+    first_free = i;
+    return i;
+  }
+
+  // Store a copy of o in the next free slot and return its index.
+  size_t insert(const O& o = O()) {
+    size_t i = next_free();
+    insert(i, o);
+    return i;
+  }
+
+  // Look up the object at index i, default-constructing it first if the slot
+  // is empty. Returns `invalid` if the table could not be grown to reach i.
+  O& operator[](size_t i) {
+    // Only build a default object when the slot is actually missing. This
+    // avoids constructing and destroying a temporary O on every lookup of an
+    // object that already exists.
+    if (i >= size || !objects[i]) insert(i, O());
+    return i < size ? *objects[i] : invalid;
+  }
+
+  // Look up the object at index i without creating it; null if absent.
+  O* find(size_t i) const { return i < size ? objects[i] : nullptr; }
+
+  // Overload pair that invokes o->on_erase() only when T declares such a
+  // member; the variadic overload is the no-op fallback.
+  template <typename T>
+  void on_erase(T*, ...) {}
+  template <typename T>
+  void on_erase(T* o, decltype(&T::on_erase)) {
+    o->on_erase();
+  }
+
+  // Destroy the object at index i, if any, and make the index reusable.
+  // Returns whether an object was actually erased.
+  bool erase(size_t i) {
+    if (i < size && objects[i]) {
+      on_erase(objects[i], nullptr);
+      delete objects[i];
+      objects[i] = nullptr;
+      if (i < first_free) first_free = i;
+      return true;
+    }
+    return false;
+  }
+
+  // Iteration over all slots, including null (unused) ones.
+  O** begin() const { return objects; }
+  O** end() const { return &objects[size]; }
+};
+
+// All state belonging to one software GL context: the object stores, the
+// fixed-function state set through the GL entry points, and the current
+// bindings.
+struct Context {
+  int32_t references = 1;
+
+  // Object tables, indexed by GL object name.
+  ObjectStore<Query> queries;
+  ObjectStore<Buffer> buffers;
+  ObjectStore<Texture> textures;
+  ObjectStore<VertexArray> vertex_arrays;
+  ObjectStore<Framebuffer> framebuffers;
+  ObjectStore<Renderbuffer> renderbuffers;
+  ObjectStore<Shader> shaders;
+  ObjectStore<Program> programs;
+
+  IntRect viewport = {0, 0, 0, 0};
+
+  // Blend state.
+  bool blend = false;
+  GLenum blendfunc_srgb = GL_ONE;
+  GLenum blendfunc_drgb = GL_ZERO;
+  GLenum blendfunc_sa = GL_ONE;
+  GLenum blendfunc_da = GL_ZERO;
+  GLenum blend_equation = GL_FUNC_ADD;
+  V8<uint16_t> blendcolor = 0;
+  BlendKey blend_key = BLEND_KEY_NONE;
+
+  // Depth state.
+  bool depthtest = false;
+  bool depthmask = true;
+  GLenum depthfunc = GL_LESS;
+
+  // Scissor state.
+  bool scissortest = false;
+  IntRect scissor = {0, 0, 0, 0};
+
+  // Clear values.
+  uint32_t clearcolor = 0;
+  GLdouble cleardepth = 1;
+
+  int unpack_row_length = 0;
+
+  int shaded_rows = 0;
+  int shaded_pixels = 0;
+
+  // Texture bindings of a single texture unit, one per texture target.
+  struct TextureUnit {
+    GLuint texture_2d_binding = 0;
+    GLuint texture_3d_binding = 0;
+    GLuint texture_2d_array_binding = 0;
+    GLuint texture_rectangle_binding = 0;
+
+    // Clear every binding of this unit that refers to texture n.
+    void unlink(GLuint n) {
+      GLuint* const bindings[] = {
+          &texture_2d_binding,
+          &texture_3d_binding,
+          &texture_2d_array_binding,
+          &texture_rectangle_binding,
+      };
+      for (GLuint* binding : bindings) {
+        ::unlink(*binding, n);
+      }
+    }
+  };
+  TextureUnit texture_units[MAX_TEXTURE_UNITS];
+  int active_texture_unit = 0;
+
+  GLuint current_program = 0;
+
+  GLuint current_vertex_array = 0;
+  bool validate_vertex_array = true;
+
+  GLuint pixel_pack_buffer_binding = 0;
+  GLuint pixel_unpack_buffer_binding = 0;
+  GLuint array_buffer_binding = 0;
+  GLuint time_elapsed_query = 0;
+  GLuint samples_passed_query = 0;
+  GLuint renderbuffer_binding = 0;
+  GLuint draw_framebuffer_binding = 0;
+  GLuint read_framebuffer_binding = 0;
+  // Scratch slot handed out for binding targets that are not recognized.
+  GLuint unknown_binding = 0;
+
+  // Map a GL binding target to the slot that stores the bound object name.
+  // Texture targets resolve within the active texture unit and the element
+  // array buffer resolves within the current vertex array. Unrecognized
+  // targets are logged and resolve to unknown_binding.
+  GLuint& get_binding(GLenum name) {
+    switch (name) {
+      // Texture targets.
+      case GL_TEXTURE_2D:
+        return texture_units[active_texture_unit].texture_2d_binding;
+      case GL_TEXTURE_3D:
+        return texture_units[active_texture_unit].texture_3d_binding;
+      case GL_TEXTURE_2D_ARRAY:
+        return texture_units[active_texture_unit].texture_2d_array_binding;
+      case GL_TEXTURE_RECTANGLE:
+        return texture_units[active_texture_unit].texture_rectangle_binding;
+      // Buffer targets.
+      case GL_ARRAY_BUFFER:
+        return array_buffer_binding;
+      case GL_ELEMENT_ARRAY_BUFFER:
+        return vertex_arrays[current_vertex_array].element_array_buffer_binding;
+      case GL_PIXEL_PACK_BUFFER:
+        return pixel_pack_buffer_binding;
+      case GL_PIXEL_UNPACK_BUFFER:
+        return pixel_unpack_buffer_binding;
+      // Query targets.
+      case GL_TIME_ELAPSED:
+        return time_elapsed_query;
+      case GL_SAMPLES_PASSED:
+        return samples_passed_query;
+      // Framebuffer and renderbuffer targets.
+      case GL_RENDERBUFFER:
+        return renderbuffer_binding;
+      case GL_DRAW_FRAMEBUFFER:
+        return draw_framebuffer_binding;
+      case GL_READ_FRAMEBUFFER:
+        return read_framebuffer_binding;
+      default:
+        break;
+    }
+    debugf("unknown binding %x\n", name);
+    assert(false);
+    return unknown_binding;
+  }
+
+  // Resolve the texture bound to the given unit for each sampler type.
+  Texture& get_texture(sampler2D, int unit) {
+    const GLuint bound = texture_units[unit].texture_2d_binding;
+    return textures[bound];
+  }
+
+  Texture& get_texture(isampler2D, int unit) {
+    const GLuint bound = texture_units[unit].texture_2d_binding;
+    return textures[bound];
+  }
+
+  Texture& get_texture(sampler2DArray, int unit) {
+    const GLuint bound = texture_units[unit].texture_2d_array_binding;
+    return textures[bound];
+  }
+
+  Texture& get_texture(sampler2DRect, int unit) {
+    const GLuint bound = texture_units[unit].texture_rectangle_binding;
+    return textures[bound];
+  }
+
+  // Clip bb against the scissor rectangle (translated by -origin) when the
+  // scissor test is enabled; otherwise return bb unchanged.
+  IntRect apply_scissor(IntRect bb,
+                        const IntPoint& origin = IntPoint(0, 0)) const {
+    if (!scissortest) {
+      return bb;
+    }
+    return bb.intersect(scissor - origin);
+  }
+
+  // Scissored bounds of a texture, taking the texture's offset as the origin.
+  IntRect apply_scissor(const Texture& t) const {
+    return apply_scissor(t.bounds(), t.offset);
+  }
+};
+static Context* ctx = nullptr;  // context that every GL entry point in this file operates on
+static VertexShaderImpl* vertex_shader = nullptr;  // set by setup_program from the program's vert_impl
+static FragmentShaderImpl* fragment_shader = nullptr;  // set by setup_program from the program's frag_impl
+static BlendKey blend_key = BLEND_KEY_NONE;  // file-level key, distinct from ctx->blend_key; NOTE(review): presumably the key in effect while drawing, confirm at its use sites
+
+static void prepare_texture(Texture& t, const IntRect* skip = nullptr);  // forward declaration; defined elsewhere in the file
+
+// Fill in the layer-related sampler fields from a texture: the layer count
+// (at least 1) and the distance between layers in sampler stride units.
+template <typename S>
+static inline void init_depth(S* s, Texture& t) {
+  S& sampler = *s;
+  sampler.depth = max(t.depth, 1);
+  sampler.height_stride = sampler.stride * t.height;
+}
+
+// Select the sampler's filter from the texture's magnification filter.
+template <typename S>
+static inline void init_filter(S* s, Texture& t) {
+  // A linear filter reads the neighboring texel, which cannot be done safely
+  // at the end of a row that is less than 2 pixels wide. Fall back to nearest
+  // filtering for such textures.
+  if (t.width >= 2) {
+    s->filter = gl_filter_to_texture_filter(t.mag_filter);
+  } else {
+    s->filter = TextureFilter::NEAREST;
+  }
+}
+
+// Point a sampler at a texture's buffer and copy over its dimensions, stride
+// (converted from bytes to texel units) and format.
+template <typename S>
+static inline void init_sampler(S* s, Texture& t) {
+  // Must run before any of the texture state below is read.
+  prepare_texture(t);
+  s->width = t.width;
+  s->height = t.height;
+  s->stride = t.stride();
+  // Convert the byte stride into units of the sampled element: 4 bytes for
+  // formats of 4 or more bytes per pixel, 2 bytes for 2 bpp, 1 byte otherwise.
+  const int bpp = t.bpp();
+  const int unit = bpp >= 4 ? 4 : (bpp == 2 ? 2 : 1);
+  assert(unit != 1 || bpp == 1);
+  s->stride /= unit;
+  // Use uint32_t* for easier sampling, but need to cast to uint8_t* or
+  // uint16_t* for formats with bpp < 4.
+  s->buf = (uint32_t*)t.buf;
+  s->format = gl_format_to_texture_format(t.internal_format);
+}
+
+// Set up a sampler for a texture that has no data: it exposes a 1x1 RGBA8
+// buffer of zeroes (transparent black).
+template <typename S>
+static inline void null_sampler(S* s) {
+  // The backing storage is a full SIMD vector of zeroes so that unaligned
+  // SIMD loads reading past the single texel still stay in bounds.
+  static const uint32_t zeroBuf[sizeof(Float) / sizeof(uint32_t)] = {0};
+  s->buf = (uint32_t*)zeroBuf;
+  s->format = TextureFormat::RGBA8;
+  s->width = 1;
+  s->height = 1;
+  s->stride = s->width;
+}
+
+// Filter used for samplers that have no texture data: nearest.
+template <typename S>
+static inline void null_filter(S* s) {
+  S& sampler = *s;
+  sampler.filter = TextureFilter::NEAREST;
+}
+
+// Layer fields used for samplers that have no texture data: a single layer
+// whose layer-to-layer distance equals the sampler's stride.
+template <typename S>
+static inline void null_depth(S* s) {
+  S& sampler = *s;
+  sampler.depth = 1;
+  sampler.height_stride = sampler.stride;
+}
+
+// Initialize a filtered sampler from the texture bound to the given unit,
+// falling back to the null sampler when that texture has no buffer.
+template <typename S>
+S* lookup_sampler(S* s, int texture) {
+  Texture& t = ctx->get_texture(s, texture);
+  if (t.buf) {
+    init_sampler(s, t);
+    init_filter(s, t);
+    return s;
+  }
+  null_sampler(s);
+  null_filter(s);
+  return s;
+}
+
+// Initialize an integer sampler (no filter state) from the texture bound to
+// the given unit, falling back to the null sampler when it has no buffer.
+template <typename S>
+S* lookup_isampler(S* s, int texture) {
+  Texture& t = ctx->get_texture(s, texture);
+  if (t.buf) {
+    init_sampler(s, t);
+  } else {
+    null_sampler(s);
+  }
+  return s;
+}
+
+// Initialize an array sampler (with layer and filter state) from the texture
+// bound to the given unit, falling back to null state when it has no buffer.
+template <typename S>
+S* lookup_sampler_array(S* s, int texture) {
+  Texture& t = ctx->get_texture(s, texture);
+  if (t.buf) {
+    init_sampler(s, t);
+    init_depth(s, t);
+    init_filter(s, t);
+    return s;
+  }
+  null_sampler(s);
+  null_depth(s);
+  null_filter(s);
+  return s;
+}
+
+// Size in bytes of one component of the given GL data type. Unsupported
+// types assert and yield 0.
+int bytes_per_type(GLenum type) {
+  int bytes = 0;
+  switch (type) {
+    case GL_INT:
+    case GL_FLOAT:
+      bytes = 4;
+      break;
+    case GL_UNSIGNED_SHORT:
+      bytes = 2;
+      break;
+    case GL_UNSIGNED_BYTE:
+      bytes = 1;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  return bytes;
+}
+
+// Widen `size` bytes of packed components of type C into the components of a
+// scalar of type S. When `normalized` is set, each component is scaled by
+// 1 / (maximum value of C). Components not covered by the input stay zero.
+template <typename S, typename C>
+static inline S expand_attrib(const char* buf, size_t size, bool normalized) {
+  typedef typename ElementType<S>::ty elem_type;
+  S scalar = {0};
+  const C* src = reinterpret_cast<const C*>(buf);
+  const size_t count = size / sizeof(C);
+  if (!normalized) {
+    for (size_t i = 0; i != count; ++i) {
+      put_nth_component(scalar, i, elem_type(src[i]));
+    }
+    return scalar;
+  }
+  const float scale = 1.0f / ((1 << (8 * sizeof(C))) - 1);
+  for (size_t i = 0; i != count; ++i) {
+    put_nth_component(scalar, i, elem_type(src[i]) * scale);
+  }
+  return scalar;
+}
+
+// Load one vertex's worth of an attribute as a scalar of type S.
+template <typename S>
+static inline S load_attrib_scalar(VertexAttrib& va, const char* src) {
+  // The source holds at least a full S, so it can be read directly.
+  if (sizeof(S) <= va.size) {
+    return *reinterpret_cast<const S*>(src);
+  }
+  // Narrow integer sources are widened component by component.
+  switch (va.type) {
+    case GL_UNSIGNED_SHORT:
+      return expand_attrib<S, uint16_t>(src, va.size, va.normalized);
+    case GL_UNSIGNED_BYTE:
+      return expand_attrib<S, uint8_t>(src, va.size, va.normalized);
+    default:
+      break;
+  }
+  // Otherwise the component type already matches; copy the bytes provided
+  // and leave the remaining components zeroed.
+  assert(sizeof(typename ElementType<S>::ty) == bytes_per_type(va.type));
+  S scalar = {0};
+  memcpy(&scalar, src, va.size);
+  return scalar;
+}
+
+// Load a vertex attribute into the SIMD lanes of `attrib`.
+//  - disabled attribute: all lanes are zero;
+//  - instanced attribute (non-zero divisor): the value for `instance` is
+//    broadcast to all lanes;
+//  - otherwise: `count` consecutive vertexes starting at `start` are loaded
+//    in the lane order described below. Nothing is written when count is 0.
+template <typename T>
+void load_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance,
+                 int count) {
+  typedef decltype(force_scalar(attrib)) scalar_type;
+  if (!va.enabled) {
+    attrib = T(scalar_type{0});
+    return;
+  }
+  if (va.divisor != 0) {
+    char* src = (char*)va.buf + va.stride * instance + va.offset;
+    assert(src + va.size <= va.buf + va.buf_size);
+    attrib = T(load_attrib_scalar<scalar_type>(va, src));
+    return;
+  }
+  // Specialized for WR's primitive vertex order/winding.
+  if (!count) return;
+  assert(count >= 2 && count <= 4);
+  char* base = (char*)va.buf + va.stride * start + va.offset;
+  // The first two vertexes are needed by every primitive type.
+  scalar_type v0 = load_attrib_scalar<scalar_type>(va, base);
+  scalar_type v1 = load_attrib_scalar<scalar_type>(va, base + va.stride);
+  if (count == 2) {
+    // Lines must be indexed at offsets 0, 1.
+    // Line vertexes fill vertex shader SIMD lanes as 0, 1, 1, 0.
+    attrib = (T){v0, v1, v1, v0};
+  } else if (count == 3) {
+    // Triangles must be indexed at offsets 0, 1, 2.
+    // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2.
+    scalar_type v2 = load_attrib_scalar<scalar_type>(va, base + va.stride * 2);
+    attrib = (T){v0, v1, v2, v2};
+  } else {
+    // Quads must be successive triangles indexed at offsets 0, 1, 2, 2,
+    // 1, 3. Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so
+    // that the points form a convex path that can be traversed by the
+    // rasterizer.
+    scalar_type v3 = load_attrib_scalar<scalar_type>(va, base + va.stride * 3);
+    scalar_type v2 = load_attrib_scalar<scalar_type>(va, base + va.stride * 2);
+    attrib = (T){v0, v1, v3, v2};
+  }
+}
+
+// Load a flat attribute: a single value that is broadcast to `attrib`. The
+// value comes from `instance` for instanced attributes (non-zero divisor) and
+// from vertex `start` otherwise. A disabled attribute loads as zero, and a
+// non-instanced load with count 0 leaves `attrib` untouched.
+template <typename T>
+void load_flat_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance,
+                      int count) {
+  typedef decltype(force_scalar(attrib)) scalar_type;
+  if (!va.enabled) {
+    attrib = T{0};
+    return;
+  }
+  char* src = nullptr;
+  if (va.divisor == 0) {
+    if (!count) {
+      return;
+    }
+    src = (char*)va.buf + va.stride * start + va.offset;
+  } else {
+    src = (char*)va.buf + va.stride * instance + va.offset;
+  }
+  assert(src + va.size <= va.buf + va.buf_size);
+  attrib = T(load_attrib_scalar<scalar_type>(va, src));
+}
+
+// Point the file-level vertex_shader/fragment_shader at the implementations
+// of the given program, or clear both when program is 0.
+void setup_program(GLuint program) {
+  if (program) {
+    Program& p = ctx->programs[program];
+    assert(p.impl);
+    assert(p.vert_impl);
+    assert(p.frag_impl);
+    vertex_shader = p.vert_impl;
+    fragment_shader = p.frag_impl;
+  } else {
+    vertex_shader = nullptr;
+    fragment_shader = nullptr;
+  }
+}
+
+extern ProgramLoader load_shader(const char* name);
+
+extern "C" {
+
+// Make `program` current. If the program being replaced was flagged as
+// deleted while it was current, it is erased now.
+void UseProgram(GLuint program) {
+  const GLuint previous = ctx->current_program;
+  if (previous && previous != program) {
+    Program* old = ctx->programs.find(previous);
+    if (old && old->deleted) {
+      ctx->programs.erase(previous);
+    }
+  }
+  ctx->current_program = program;
+  setup_program(program);
+}
+
+// Store the viewport as a rectangle with corners (x, y) and
+// (x + width, y + height).
+void SetViewport(GLint x, GLint y, GLsizei width, GLsizei height) {
+  IntRect rect = IntRect{x, y, x + width, y + height};
+  ctx->viewport = rect;
+}
+
+// Turn on a capability. Only blending, depth testing and scissor testing are
+// tracked; any other capability is ignored.
+void Enable(GLenum cap) {
+  if (cap == GL_BLEND) {
+    ctx->blend = true;
+  } else if (cap == GL_DEPTH_TEST) {
+    ctx->depthtest = true;
+  } else if (cap == GL_SCISSOR_TEST) {
+    ctx->scissortest = true;
+  }
+}
+
+// Turn off a capability. Only blending, depth testing and scissor testing
+// are tracked; any other capability is ignored.
+void Disable(GLenum cap) {
+  if (cap == GL_BLEND) {
+    ctx->blend = false;
+  } else if (cap == GL_DEPTH_TEST) {
+    ctx->depthtest = false;
+  } else if (cap == GL_SCISSOR_TEST) {
+    ctx->scissortest = false;
+  }
+}
+
+// Errors are not tracked; this always reports GL_NO_ERROR.
+GLenum GetError() {
+  return GL_NO_ERROR;
+}
+
+static const char* const extensions[] = {  // extension names advertised through GetStringi(GL_EXTENSIONS) and counted by GetIntegerv(GL_NUM_EXTENSIONS)
+ "GL_ARB_blend_func_extended", "GL_ARB_copy_image",
+ "GL_ARB_draw_instanced", "GL_ARB_explicit_attrib_location",
+ "GL_ARB_instanced_arrays", "GL_ARB_invalidate_subdata",
+ "GL_ARB_texture_storage", "GL_EXT_timer_query",
+ "GL_APPLE_rgb_422",
+};
+
+// Report the integer state/limit identified by pname in params[0].
+// Unhandled parameters are logged and leave params untouched.
+void GetIntegerv(GLenum pname, GLint* params) {
+  assert(params);
+  switch (pname) {
+    // Implementation limits.
+    case GL_MAX_TEXTURE_UNITS:
+    case GL_MAX_TEXTURE_IMAGE_UNITS:
+      *params = MAX_TEXTURE_UNITS;
+      return;
+    case GL_MAX_TEXTURE_SIZE:
+    case GL_MAX_ARRAY_TEXTURE_LAYERS:
+      *params = 1 << 15;
+      return;
+    // Current bindings.
+    case GL_READ_FRAMEBUFFER_BINDING:
+      *params = ctx->read_framebuffer_binding;
+      return;
+    case GL_DRAW_FRAMEBUFFER_BINDING:
+      *params = ctx->draw_framebuffer_binding;
+      return;
+    case GL_PIXEL_PACK_BUFFER_BINDING:
+      *params = ctx->pixel_pack_buffer_binding;
+      return;
+    case GL_PIXEL_UNPACK_BUFFER_BINDING:
+      *params = ctx->pixel_unpack_buffer_binding;
+      return;
+    // Version and extension information.
+    case GL_NUM_EXTENSIONS:
+      *params = sizeof(extensions) / sizeof(extensions[0]);
+      return;
+    case GL_MAJOR_VERSION:
+      *params = 3;
+      return;
+    case GL_MINOR_VERSION:
+      *params = 2;
+      return;
+    default:
+      break;
+  }
+  debugf("unhandled glGetIntegerv parameter %x\n", pname);
+  assert(false);
+}
+
+// Report the boolean state identified by pname in params[0]. Only the depth
+// write mask is supported; anything else is logged and leaves params as is.
+void GetBooleanv(GLenum pname, GLboolean* params) {
+  assert(params);
+  if (pname == GL_DEPTH_WRITEMASK) {
+    params[0] = ctx->depthmask;
+    return;
+  }
+  debugf("unhandled glGetBooleanv parameter %x\n", pname);
+  assert(false);
+}
+
+// Return the implementation string for the vendor, renderer or version.
+// Other names are logged and yield null.
+const char* GetString(GLenum name) {
+  if (name == GL_VENDOR) {
+    return "Mozilla Gfx";
+  }
+  if (name == GL_RENDERER) {
+    return "Software WebRender";
+  }
+  if (name == GL_VERSION) {
+    return "3.2";
+  }
+  debugf("unhandled glGetString parameter %x\n", name);
+  assert(false);
+  return nullptr;
+}
+
+// Return the index-th advertised extension name for GL_EXTENSIONS, or null
+// when the index is out of range. Other names are logged and yield null.
+const char* GetStringi(GLenum name, GLuint index) {
+  if (name != GL_EXTENSIONS) {
+    debugf("unhandled glGetStringi parameter %x\n", name);
+    assert(false);
+    return nullptr;
+  }
+  const bool in_range = index < sizeof(extensions) / sizeof(extensions[0]);
+  return in_range ? extensions[index] : nullptr;
+}
+
+// Canonicalize an alpha blend factor against the RGB factor for the same
+// operand. When one of the two is the *_ALPHA variant and the other is the
+// matching *_COLOR variant, the alpha factor is rewritten to equal the RGB
+// factor. Every other factor is returned unchanged.
+GLenum remap_blendfunc(GLenum rgb, GLenum a) {
+  // Each row pairs the alpha variant of a factor with its color variant.
+  static const GLenum kVariants[][2] = {
+      {GL_SRC_ALPHA, GL_SRC_COLOR},
+      {GL_ONE_MINUS_SRC_ALPHA, GL_ONE_MINUS_SRC_COLOR},
+      {GL_DST_ALPHA, GL_DST_COLOR},
+      {GL_ONE_MINUS_DST_ALPHA, GL_ONE_MINUS_DST_COLOR},
+      {GL_CONSTANT_ALPHA, GL_CONSTANT_COLOR},
+      {GL_ONE_MINUS_CONSTANT_ALPHA, GL_ONE_MINUS_CONSTANT_COLOR},
+      {GL_SRC1_ALPHA, GL_SRC1_COLOR},
+      {GL_ONE_MINUS_SRC1_ALPHA, GL_ONE_MINUS_SRC1_COLOR},
+  };
+  for (const auto& variant : kVariants) {
+    const GLenum alpha_form = variant[0];
+    const GLenum color_form = variant[1];
+    if (a == alpha_form) {
+      return rgb == color_form ? color_form : a;
+    }
+    if (a == color_form) {
+      return rgb == alpha_form ? alpha_form : a;
+    }
+  }
+  return a;
+}
+
+void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) {  // records the four blend factors and selects the matching ctx->blend_key
+ ctx->blendfunc_srgb = srgb;
+ ctx->blendfunc_drgb = drgb;
+ sa = remap_blendfunc(srgb, sa);  // rewrite alpha factors that are just the COLOR/ALPHA variant of the RGB factor
+ da = remap_blendfunc(drgb, da);
+ ctx->blendfunc_sa = sa;  // the remapped values are what gets stored
+ ctx->blendfunc_da = da;
+
+#define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20))  // packs the four factors into one integer usable as a case label
+ int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0);
+ if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da);  // separate alpha factors only contribute when they differ from the RGB ones
+ switch (hash) {
+#define MAP_BLEND_KEY(...) \
+ case HASH_BLEND_KEY(__VA_ARGS__): \
+ ctx->blend_key = BLEND_KEY(__VA_ARGS__); \
+ break;
+ FOR_EACH_BLEND_KEY(MAP_BLEND_KEY)  // expands to one case per supported factor combination
+ default:  // unsupported combination: logged, and ctx->blend_key keeps its previous value
+ debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da);
+ assert(false);
+ break;
+ }
+}
+
+// Store the constant blend color. The components are arranged in b, g, r, a
+// order, rounded to integer pixel values, converted to 16-bit lanes and
+// replicated twice across the 8-lane blendcolor vector.
+void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
+  const Float bgra = (Float){b, g, r, a};
+  I32 c = round_pixel(bgra);
+  ctx->blendcolor = CONVERT(c, U16).xyzwxyzw;
+}
+
+// Record the blend equation. Only GL_FUNC_ADD is expected; other modes
+// assert but are stored regardless.
+void BlendEquation(GLenum mode) {
+  const bool supported = (mode == GL_FUNC_ADD);
+  assert(supported);
+  (void)supported;  // avoids an unused-variable warning when asserts are off
+  ctx->blend_equation = mode;
+}
+
+// Enable or disable writes to the depth buffer.
+void DepthMask(GLboolean flag) {
+  ctx->depthmask = flag;
+}
+
+// Record the depth comparison function. Only GL_LESS and GL_LEQUAL are
+// expected; other values assert but are stored regardless.
+void DepthFunc(GLenum func) {
+  if (func != GL_LESS && func != GL_LEQUAL) {
+    assert(false);
+  }
+  ctx->depthfunc = func;
+}
+
+// Store the scissor box as a rectangle with corners (x, y) and
+// (x + width, y + height).
+void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) {
+  IntRect rect = IntRect{x, y, x + width, y + height};
+  ctx->scissor = rect;
+}
+
+// Store the clear color as a packed 32-bit pixel. The components are
+// arranged in b, g, r, a order, rounded to integer pixel values and narrowed
+// to 8 bits each.
+void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
+  const Float bgra = (Float){b, g, r, a};
+  I32 c = round_pixel(bgra);
+  ctx->clearcolor = bit_cast<uint32_t>(CONVERT(c, U8));
+}
+
+// Record the value the depth buffer is cleared to.
+void ClearDepth(GLdouble depth) {
+  ctx->cleardepth = depth;
+}
+
+// Select the active texture unit from a GL_TEXTUREi enum. Out-of-range
+// values assert and are clamped to the valid unit range.
+void ActiveTexture(GLenum texture) {
+  assert(texture >= GL_TEXTURE0);
+  assert(texture < GL_TEXTURE0 + MAX_TEXTURE_UNITS);
+  const int requested = int(texture - GL_TEXTURE0);
+  const int last_unit = int(MAX_TEXTURE_UNITS - 1);
+  ctx->active_texture_unit = clamp(requested, 0, last_unit);
+}
+
+// Create n query objects and write their names to result[0..n).
+void GenQueries(GLsizei n, GLuint* result) {
+  GLuint* out = result;
+  for (int remaining = n; remaining > 0; remaining--) {
+    Query q;
+    *out++ = ctx->queries.insert(q);
+  }
+}
+
+// Delete query n (0 is ignored) and clear any query binding referring to it.
+void DeleteQuery(GLuint n) {
+  if (!n || !ctx->queries.erase(n)) {
+    return;
+  }
+  unlink(ctx->time_elapsed_query, n);
+  unlink(ctx->samples_passed_query, n);
+}
+
+// Create n buffer objects and write their names to result[0..n).
+void GenBuffers(int n, GLuint* result) {
+  GLuint* out = result;
+  for (int remaining = n; remaining > 0; remaining--) {
+    Buffer b;
+    *out++ = ctx->buffers.insert(b);
+  }
+}
+
+// Delete buffer n (0 is ignored) and clear the pixel pack, pixel unpack and
+// array buffer bindings if they refer to it.
+void DeleteBuffer(GLuint n) {
+  if (!n || !ctx->buffers.erase(n)) {
+    return;
+  }
+  unlink(ctx->pixel_pack_buffer_binding, n);
+  unlink(ctx->pixel_unpack_buffer_binding, n);
+  unlink(ctx->array_buffer_binding, n);
+}
+
+// Create n vertex array objects and write their names to result[0..n).
+void GenVertexArrays(int n, GLuint* result) {
+  GLuint* out = result;
+  for (int remaining = n; remaining > 0; remaining--) {
+    VertexArray v;
+    *out++ = ctx->vertex_arrays.insert(v);
+  }
+}
+
+// Delete vertex array n (0 is ignored) and unbind it if it is current.
+void DeleteVertexArray(GLuint n) {
+  if (!n || !ctx->vertex_arrays.erase(n)) {
+    return;
+  }
+  unlink(ctx->current_vertex_array, n);
+}
+
+// Create a shader object of the given type and return its name.
+GLuint CreateShader(GLenum type) {
+  Shader shader;
+  shader.type = type;
+  const GLuint name = ctx->shaders.insert(shader);
+  return name;
+}
+
+// Attach the loader registered under `name` to the shader. An unknown name
+// is logged and leaves the shader with a null loader.
+void ShaderSourceByName(GLuint shader, char* name) {
+  Shader& s = ctx->shaders[shader];
+  ProgramLoader loader = load_shader(name);
+  s.loader = loader;
+  if (loader) {
+    return;
+  }
+  debugf("unknown shader %s\n", name);
+}
+
+// Attach a shader to a program. The first vertex or fragment shader attached
+// with a valid loader instantiates the program implementation; shaders of
+// any other type assert.
+void AttachShader(GLuint program, GLuint shader) {
+  Program& p = ctx->programs[program];
+  Shader& s = ctx->shaders[shader];
+  const bool known_stage =
+      s.type == GL_VERTEX_SHADER || s.type == GL_FRAGMENT_SHADER;
+  if (!known_stage) {
+    assert(0);
+    return;
+  }
+  // Both stages are handled identically: create the implementation once.
+  if (!p.impl && s.loader) {
+    p.impl = s.loader();
+  }
+}
+
+// Delete shader n; the null name 0 is ignored.
+void DeleteShader(GLuint n) {
+  if (!n) {
+    return;
+  }
+  ctx->shaders.erase(n);
+}
+
+// Create an empty program object and return its name.
+GLuint CreateProgram() {
+  Program program;
+  const GLuint name = ctx->programs.insert(program);
+  return name;
+}
+
+// Delete program n (0 is ignored). A program that is currently in use is
+// only flagged as deleted; UseProgram erases it once it is replaced.
+void DeleteProgram(GLuint n) {
+  if (!n) return;
+  if (ctx->current_program != n) {
+    ctx->programs.erase(n);
+    return;
+  }
+  Program* in_use = ctx->programs.find(n);
+  if (in_use) {
+    in_use->deleted = true;
+  }
+}
+
+// Resolve the program's vertex and fragment shader implementations from its
+// program implementation. Does nothing if no implementation was attached.
+void LinkProgram(GLuint program) {
+  Program& p = ctx->programs[program];
+  ProgramImpl* impl = p.impl;
+  assert(impl);
+  if (!impl) {
+    return;
+  }
+  assert(impl->interpolants_size() <= sizeof(Interpolants));
+  if (!p.vert_impl) {
+    p.vert_impl = impl->get_vertex_shader();
+  }
+  if (!p.frag_impl) {
+    p.frag_impl = impl->get_fragment_shader();
+  }
+}
+
+// Returns 1 if the program exists and has an implementation, 0 otherwise.
+GLint GetLinkStatus(GLuint program) {
+  Program* p = ctx->programs.find(program);
+  if (!p) {
+    return 0;
+  }
+  return p->impl ? 1 : 0;
+}
+
+// Bind the named attribute of the program to the given index. Does nothing
+// if the program has no implementation.
+void BindAttribLocation(GLuint program, GLuint index, char* name) {
+  Program& p = ctx->programs[program];
+  assert(p.impl);
+  if (p.impl) {
+    p.impl->bind_attrib(name, index);
+  }
+}
+
+// Look up the location of the named attribute in the program. Returns -1 if
+// the program has no implementation.
+GLint GetAttribLocation(GLuint program, char* name) {
+  Program& p = ctx->programs[program];
+  assert(p.impl);
+  if (p.impl) {
+    return p.impl->get_attrib(name);
+  }
+  return -1;
+}
+
+// Look up the location of the named uniform in the program. Returns -1 if
+// the program has no implementation.
+GLint GetUniformLocation(GLuint program, char* name) {
+  Program& p = ctx->programs[program];
+  assert(p.impl);
+  if (p.impl) {
+    return p.impl->get_uniform(name);
+  }
+  return -1;
+}
+
+// Current value of a monotonic clock, in nanoseconds. Only differences
+// between two values are meaningful (see BeginQuery/EndQuery).
+static uint64_t get_time_value() {
+#ifdef __MACH__
+  // mach_absolute_time() counts in platform-specific ticks; scale them to
+  // nanoseconds with the timebase ratio (numer / denom).
+  static mach_timebase_info_data_t timebase = {0, 0};
+  if (timebase.denom == 0) {
+    mach_timebase_info(&timebase);
+  }
+  const uint64_t ticks = mach_absolute_time();
+  if (timebase.denom == 0) {
+    // The timebase query failed; fall back to the raw tick count.
+    return ticks;
+  }
+  // Scale the quotient and the remainder separately so that the intermediate
+  // product cannot overflow 64 bits.
+  return (ticks / timebase.denom) * timebase.numer +
+         (ticks % timebase.denom) * timebase.numer / timebase.denom;
+#elif defined(_WIN32)
+  LARGE_INTEGER time;
+  static bool have_frequency = false;
+  static LARGE_INTEGER frequency;
+  if (!have_frequency) {
+    QueryPerformanceFrequency(&frequency);
+    have_frequency = true;
+  }
+  QueryPerformanceCounter(&time);
+  // Multiplying the whole counter by 1e9 before dividing overflows 64 bits
+  // once the counter is large enough (about half an hour at 10 MHz). Convert
+  // whole seconds and the sub-second remainder separately instead; the result
+  // is identical whenever the naive product would not have overflowed.
+  const uint64_t ticks = uint64_t(time.QuadPart);
+  const uint64_t ticks_per_sec = uint64_t(frequency.QuadPart);
+  return (ticks / ticks_per_sec) * 1000000000ULL +
+         (ticks % ticks_per_sec) * 1000000000ULL / ticks_per_sec;
+#else
+  struct timespec tp;
+  clock_gettime(CLOCK_MONOTONIC, &tp);
+  return tp.tv_sec * 1000000000ULL + tp.tv_nsec;
+#endif
+}
+
+// Start a query: bind `id` to the target and initialize its value (0 for a
+// sample count, the current time for an elapsed-time query). Unknown targets
+// are logged.
+void BeginQuery(GLenum target, GLuint id) {
+  ctx->get_binding(target) = id;
+  Query& q = ctx->queries[id];
+  if (target == GL_SAMPLES_PASSED) {
+    q.value = 0;
+  } else if (target == GL_TIME_ELAPSED) {
+    q.value = get_time_value();
+  } else {
+    debugf("unknown query target %x for query %d\n", target, id);
+    assert(false);
+  }
+}
+
+// Finish the query bound to the target and unbind it. An elapsed-time query
+// has its start time replaced by the time elapsed since BeginQuery; a sample
+// count query is left as accumulated. Unknown targets are logged.
+void EndQuery(GLenum target) {
+  Query& q = ctx->queries[ctx->get_binding(target)];
+  if (target == GL_TIME_ELAPSED) {
+    q.value = get_time_value() - q.value;
+  } else if (target != GL_SAMPLES_PASSED) {
+    debugf("unknown query target %x\n", target);
+    assert(false);
+  }
+  ctx->get_binding(target) = 0;
+}
+
+// Read back a query's result into params[0]. Only GL_QUERY_RESULT is
+// supported; other parameter names assert.
+void GetQueryObjectui64v(GLuint id, GLenum pname, GLuint64* params) {
+  Query& q = ctx->queries[id];
+  if (pname != GL_QUERY_RESULT) {
+    assert(false);
+    return;
+  }
+  assert(params);
+  params[0] = q.value;
+}
+
+// Make the vertex array current, flagging it for validation when the
+// binding actually changes.
+void BindVertexArray(GLuint vertex_array) {
+  const bool changed = ctx->current_vertex_array != vertex_array;
+  if (changed) {
+    ctx->validate_vertex_array = true;
+  }
+  ctx->current_vertex_array = vertex_array;
+}
+
+// Bind a texture to the given target of the active texture unit.
+void BindTexture(GLenum target, GLuint texture) {
+  GLuint& slot = ctx->get_binding(target);
+  slot = texture;
+}
+
+// Bind a buffer to the given buffer target.
+void BindBuffer(GLenum target, GLuint buffer) {
+  GLuint& slot = ctx->get_binding(target);
+  slot = buffer;
+}
+
+// Bind a framebuffer. GL_FRAMEBUFFER binds it for both reading and drawing;
+// the read and draw targets bind only their own slot.
+void BindFramebuffer(GLenum target, GLuint fb) {
+  if (target != GL_FRAMEBUFFER) {
+    assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER);
+    ctx->get_binding(target) = fb;
+    return;
+  }
+  ctx->read_framebuffer_binding = fb;
+  ctx->draw_framebuffer_binding = fb;
+}
+
+// Bind a renderbuffer to the given target.
+void BindRenderbuffer(GLenum target, GLuint rb) {
+  GLuint& slot = ctx->get_binding(target);
+  slot = rb;
+}
+
+// Set a pixel store parameter. The unpack row length is recorded; the unpack
+// alignment is only checked to be 1; every other parameter is ignored.
+void PixelStorei(GLenum name, GLint param) {
+  switch (name) {
+    case GL_UNPACK_ALIGNMENT:
+      assert(param == 1);
+      break;
+    case GL_UNPACK_ROW_LENGTH:
+      ctx->unpack_row_length = param;
+      break;
+    default:
+      break;
+  }
+}
+
+// Translate an unsized format into the sized internal format used for
+// storage. Formats without a mapping are returned unchanged.
+static GLenum remap_internal_format(GLenum format) {
+  if (format == GL_DEPTH_COMPONENT) return GL_DEPTH_COMPONENT16;
+  if (format == GL_RGBA) return GL_RGBA8;
+  if (format == GL_RED) return GL_R8;
+  if (format == GL_RG) return GL_RG8;
+  if (format == GL_RGB_422_APPLE) return GL_RGB_RAW_422_APPLE;
+  return format;
+}
+
+// Define the storage of the texture bound to `target` as a single-level
+// image of the given format, size and depth, (re)allocating its buffer as
+// needed.
+void TexStorage3D(GLenum target, GLint levels, GLenum internal_format,
+                  GLsizei width, GLsizei height, GLsizei depth) {
+  assert(levels == 1);
+  Texture& t = ctx->textures[ctx->get_binding(target)];
+  internal_format = remap_internal_format(internal_format);
+  // Only touch the texture description when something actually differs.
+  const bool changed = t.width != width || t.height != height ||
+                       t.depth != depth ||
+                       t.internal_format != internal_format;
+  if (changed) {
+    t.internal_format = internal_format;
+    t.width = width;
+    t.height = height;
+    t.depth = depth;
+  }
+  t.disable_delayed_clear();
+  t.allocate(changed);
+}
+
+} // extern "C"
+
+// Whether data supplied in external_format must be converted before it can
+// be stored in internal_format. This is only the case for GL_RGBA data going
+// into a GL_RGBA8 texture.
+static bool format_requires_conversion(GLenum external_format,
+                                       GLenum internal_format) {
+  return external_format == GL_RGBA && internal_format == GL_RGBA8;
+}
+
+// Copy `width` 32-bit pixels from src to dest while exchanging the first and
+// third byte of each pixel (the red/blue swap between RGBA and BGRA).
+static inline void copy_bgra8_to_rgba8(uint32_t* dest, const uint32_t* src,
+                                       int width) {
+  // Bulk of the row: four pixels per iteration using vector operations.
+  while (width >= 4) {
+    U32 pixels = unaligned_load<U32>(src);
+    U32 swapped = pixels & 0x00FF00FF;
+    unaligned_store(dest,
+                    (pixels & 0xFF00FF00) | (swapped << 16) | (swapped >> 16));
+    width -= 4;
+    dest += 4;
+    src += 4;
+  }
+  // Remaining pixels, one at a time.
+  while (width > 0) {
+    uint32_t pixel = *src;
+    uint32_t swapped = pixel & 0x00FF00FF;
+    *dest = (pixel & 0xFF00FF00) | (swapped << 16) | (swapped >> 16);
+    width--;
+    dest++;
+    src++;
+  }
+}
+
+// Copy a width x height block of pixels from src_buf to dst_buf, advancing
+// by the respective stride (in bytes) after each row. GL_RGBA data going
+// into a GL_RGBA8 texture has its channels swizzled on the way; everything
+// else is copied verbatim.
+static void convert_copy(GLenum external_format, GLenum internal_format,
+                         uint8_t* dst_buf, size_t dst_stride,
+                         const uint8_t* src_buf, size_t src_stride,
+                         size_t width, size_t height) {
+  const bool swizzle =
+      external_format == GL_RGBA && internal_format == GL_RGBA8;
+  if (swizzle) {
+    for (size_t row = 0; row < height; ++row) {
+      copy_bgra8_to_rgba8((uint32_t*)dst_buf, (const uint32_t*)src_buf, width);
+      dst_buf += dst_stride;
+      src_buf += src_stride;
+    }
+    return;
+  }
+  const size_t row_bytes = width * bytes_for_internal_format(internal_format);
+  for (size_t row = 0; row < height; ++row) {
+    memcpy(dst_buf, src_buf, row_bytes);
+    dst_buf += dst_stride;
+    src_buf += src_stride;
+  }
+}
+
+// Define a 2D texture's storage. The texture either wraps the externally
+// supplied buffer `buf` (with the given stride) or owns an internally
+// allocated buffer. When `buf` needs format conversion, it is copied into an
+// internally allocated buffer instead of being wrapped.
+static void set_tex_storage(Texture& t, GLenum external_format, GLsizei width,
+                            GLsizei height, void* buf = nullptr,
+                            GLsizei stride = 0, GLsizei min_width = 0,
+                            GLsizei min_height = 0) {
+  GLenum internal_format = remap_internal_format(external_format);
+  bool changed = t.width != width || t.height != height || t.depth != 0 ||
+                 t.internal_format != internal_format;
+  if (changed) {
+    t.internal_format = internal_format;
+    t.width = width;
+    t.height = height;
+    t.depth = 0;
+  }
+  // The buffer is internally managed when none was supplied, or when the
+  // supplied data is in a non-native format and therefore has to be copied
+  // into a native-format buffer of our own.
+  const bool internally_managed =
+      buf == nullptr ||
+      format_requires_conversion(external_format, internal_format);
+  // Switching between internally managed and externally supplied storage
+  // requires dropping the old buffer state first.
+  if (t.should_free() != internally_managed) {
+    changed = true;
+    t.cleanup();
+    t.set_should_free(internally_managed);
+  }
+  // If now an external buffer, explicitly set it...
+  if (!internally_managed) {
+    t.set_buffer(buf, stride);
+  }
+  t.disable_delayed_clear();
+  t.allocate(changed, min_width, min_height);
+  // Data that came with a buffer but is internally managed still has to be
+  // converted into the texture's own buffer.
+  if (buf && internally_managed) {
+    convert_copy(external_format, internal_format, (uint8_t*)t.buf, t.stride(),
+                 (const uint8_t*)buf, stride, width, height);
+  }
+}
+
+extern "C" {
+
+// Define the storage of the texture bound to `target` as a single-level 2D
+// image with an internally allocated buffer.
+void TexStorage2D(GLenum target, GLint levels, GLenum internal_format,
+                  GLsizei width, GLsizei height) {
+  assert(levels == 1);
+  const GLuint bound = ctx->get_binding(target);
+  Texture& t = ctx->textures[bound];
+  set_tex_storage(t, internal_format, width, height);
+}
+
+// Determine the sized internal format that matches pixel data described by
+// a format/type pair. Unsupported combinations are logged and yield 0.
+GLenum internal_format_for_data(GLenum format, GLenum ty) {
+  const bool bytes8888 =
+      ty == GL_UNSIGNED_BYTE || ty == GL_UNSIGNED_INT_8_8_8_8_REV;
+  switch (format) {
+    case GL_RED:
+      if (ty == GL_UNSIGNED_BYTE) return GL_R8;
+      if (ty == GL_UNSIGNED_SHORT) return GL_R16;
+      break;
+    case GL_RGBA:
+      if (bytes8888) return GL_RGBA8;
+      if (ty == GL_FLOAT) return GL_RGBA32F;
+      break;
+    case GL_BGRA:
+      if (bytes8888) return GL_RGBA8;
+      break;
+    case GL_RGBA_INTEGER:
+      if (ty == GL_INT) return GL_RGBA32I;
+      break;
+    case GL_RG:
+      if (ty == GL_UNSIGNED_BYTE) return GL_RG8;
+      break;
+    case GL_RGB_422_APPLE:
+      if (ty == GL_UNSIGNED_SHORT_8_8_REV_APPLE) return GL_RGB_RAW_422_APPLE;
+      break;
+    default:
+      break;
+  }
+  debugf("unknown internal format for format %x, type %x\n", format, ty);
+  assert(false);
+  return 0;
+}
+
+// The buffer bound as the pixel pack buffer, or null when none is bound.
+static Buffer* get_pixel_pack_buffer() {
+  if (!ctx->pixel_pack_buffer_binding) {
+    return nullptr;
+  }
+  return &ctx->buffers[ctx->pixel_pack_buffer_binding];
+}
+
+// Resolve a pixel pack destination. With a pixel pack buffer bound, `data`
+// is a byte offset into that buffer (null if the buffer has no storage);
+// otherwise `data` is already a pointer and is returned as is.
+static void* get_pixel_pack_buffer_data(void* data) {
+  Buffer* b = get_pixel_pack_buffer();
+  if (!b) {
+    return data;
+  }
+  if (!b->buf) {
+    return nullptr;
+  }
+  return b->buf + (size_t)data;
+}
+
+// Returns the buffer bound as the pixel unpack buffer, or nullptr when the
+// binding is zero.
+static Buffer* get_pixel_unpack_buffer() {
+  if (!ctx->pixel_unpack_buffer_binding) {
+    return nullptr;
+  }
+  return &ctx->buffers[ctx->pixel_unpack_buffer_binding];
+}
+
+// Resolves an unpack source. Without a bound pixel unpack buffer, |data| is
+// returned unchanged. With one bound, |data| is treated as a byte offset into
+// that buffer's storage; nullptr is returned if the buffer has no storage.
+static void* get_pixel_unpack_buffer_data(void* data) {
+  Buffer* pbo = get_pixel_unpack_buffer();
+  if (!pbo) {
+    return data;
+  }
+  if (!pbo->buf) {
+    return nullptr;
+  }
+  return pbo->buf + (size_t)data;
+}
+
+// Uploads a width x height rectangle of client pixels into level 0 of the
+// texture bound to |target|, placing it at (xoffset, yoffset). The source may
+// come from client memory or from the bound pixel unpack buffer, and honors
+// the context's unpack row length. Any pending delayed clear is resolved
+// first, except inside the rectangle that is about to be overwritten.
+void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset,
+                   GLsizei width, GLsizei height, GLenum format, GLenum ty,
+                   void* data) {
+  // Only the base level exists.
+  if (level != 0) {
+    assert(false);
+    return;
+  }
+  void* pixels = get_pixel_unpack_buffer_data(data);
+  if (!pixels) {
+    return;
+  }
+  Texture& tex = ctx->textures[ctx->get_binding(target)];
+  IntRect upload_rect = {xoffset, yoffset, xoffset + width, yoffset + height};
+  prepare_texture(tex, &upload_rect);
+  assert(xoffset + width <= tex.width);
+  assert(yoffset + height <= tex.height);
+  assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width);
+  // A zero unpack row length means rows are tightly packed at |width| pixels.
+  GLsizei row_length = width;
+  if (ctx->unpack_row_length != 0) {
+    row_length = ctx->unpack_row_length;
+  }
+  assert(tex.internal_format == internal_format_for_data(format, ty));
+  // When a conversion is needed the source pixel size comes from |format|,
+  // otherwise it matches the texture's own pixel size.
+  int src_bpp;
+  if (format_requires_conversion(format, tex.internal_format)) {
+    src_bpp = bytes_for_internal_format(format);
+  } else {
+    src_bpp = tex.bpp();
+  }
+  if (!src_bpp || !tex.buf) {
+    return;
+  }
+  uint8_t* dest = (uint8_t*)tex.sample_ptr(xoffset, yoffset);
+  convert_copy(format, tex.internal_format, dest, tex.stride(),
+               (const uint8_t*)pixels, row_length * src_bpp, width, height);
+}
+
+// Defines level 0 of the texture bound to |target|: allocates storage for the
+// requested internal format and size, then uploads |data| over the whole
+// image. Borders and non-zero levels are not supported.
+void TexImage2D(GLenum target, GLint level, GLint internal_format,
+                GLsizei width, GLsizei height, GLint border, GLenum format,
+                GLenum ty, void* data) {
+  const bool base_level = level == 0;
+  if (!base_level) {
+    assert(false);
+    return;
+  }
+  assert(border == 0);
+  // Allocate first, then fill starting at the origin.
+  TexStorage2D(target, 1, internal_format, width, height);
+  const GLint origin = 0;
+  TexSubImage2D(target, 0, origin, origin, width, height, format, ty, data);
+}
+
+// Uploads a width x height x depth box of client pixels into level 0 of the
+// layered texture bound to |target|, starting at (xoffset, yoffset, zoffset).
+// Layers in the source are consumed back to back, each |height| rows of the
+// source stride.
+void TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset,
+                   GLint zoffset, GLsizei width, GLsizei height, GLsizei depth,
+                   GLenum format, GLenum ty, void* data) {
+  // Only the base level exists.
+  if (level != 0) {
+    assert(false);
+    return;
+  }
+  void* pixels = get_pixel_unpack_buffer_data(data);
+  if (!pixels) {
+    return;
+  }
+  Texture& tex = ctx->textures[ctx->get_binding(target)];
+  prepare_texture(tex);
+  assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width);
+  // A zero unpack row length means rows are tightly packed at |width| pixels.
+  GLsizei row_length = width;
+  if (ctx->unpack_row_length != 0) {
+    row_length = ctx->unpack_row_length;
+  }
+  assert(tex.internal_format == internal_format_for_data(format, ty));
+  int src_bpp;
+  if (format_requires_conversion(format, tex.internal_format)) {
+    src_bpp = bytes_for_internal_format(format);
+  } else {
+    src_bpp = tex.bpp();
+  }
+  if (!src_bpp || !tex.buf) {
+    return;
+  }
+  assert(xoffset + width <= tex.width);
+  assert(yoffset + height <= tex.height);
+  assert(zoffset + depth <= tex.depth);
+  size_t dest_stride = tex.stride();
+  size_t src_stride = row_length * src_bpp;
+  const uint8_t* layer_src = (const uint8_t*)pixels;
+  for (int layer = 0; layer < depth; layer++) {
+    uint8_t* layer_dest =
+        (uint8_t*)tex.sample_ptr(xoffset, yoffset, zoffset + layer);
+    convert_copy(format, tex.internal_format, layer_dest, dest_stride,
+                 layer_src, src_stride, width, height);
+    // Advance past the rows consumed by this layer.
+    layer_src += src_stride * height;
+  }
+}
+
+// Defines level 0 of the layered texture bound to |target|: allocates storage
+// for the requested internal format and dimensions, then uploads |data| over
+// the whole volume. Borders and non-zero levels are not supported.
+void TexImage3D(GLenum target, GLint level, GLint internal_format,
+                GLsizei width, GLsizei height, GLsizei depth, GLint border,
+                GLenum format, GLenum ty, void* data) {
+  const bool base_level = level == 0;
+  if (!base_level) {
+    assert(false);
+    return;
+  }
+  assert(border == 0);
+  // Allocate first, then fill starting at the origin.
+  TexStorage3D(target, 1, internal_format, width, height, depth);
+  const GLint origin = 0;
+  TexSubImage3D(target, 0, origin, origin, origin, width, height, depth,
+                format, ty, data);
+}
+
+// Intentionally a no-op: mipmaps are not implemented, so |target| is ignored
+// and no additional levels are generated.
+void GenerateMipmap(UNUSED GLenum target) {
+ // TODO: support mipmaps
+}
+
+// Applies a sampler parameter to texture |texid|. Only the min/mag filters
+// are stored; the S/T wrap modes are asserted to be GL_CLAMP_TO_EDGE and all
+// other parameter names are ignored.
+void SetTextureParameter(GLuint texid, GLenum pname, GLint param) {
+  Texture& tex = ctx->textures[texid];
+  if (pname == GL_TEXTURE_WRAP_S || pname == GL_TEXTURE_WRAP_T) {
+    assert(param == GL_CLAMP_TO_EDGE);
+  } else if (pname == GL_TEXTURE_MIN_FILTER) {
+    tex.min_filter = param;
+  } else if (pname == GL_TEXTURE_MAG_FILTER) {
+    tex.mag_filter = param;
+  }
+}
+
+// Applies a sampler parameter to whichever texture is bound to |target|.
+void TexParameteri(GLenum target, GLenum pname, GLint param) {
+  auto bound = ctx->get_binding(target);
+  SetTextureParameter(bound, pname, param);
+}
+
+// Creates |n| default-constructed textures and writes their ids to |result|.
+void GenTextures(int n, GLuint* result) {
+  int created = 0;
+  while (created < n) {
+    Texture fresh;
+    result[created] = ctx->textures.insert(fresh);
+    created++;
+  }
+}
+
+// Deletes texture |n|. If the id was non-zero and actually erased, it is also
+// unlinked from every texture unit.
+void DeleteTexture(GLuint n) {
+  if (!n || !ctx->textures.erase(n)) {
+    return;
+  }
+  for (size_t unit = 0; unit < MAX_TEXTURE_UNITS; unit++) {
+    ctx->texture_units[unit].unlink(n);
+  }
+}
+
+// Creates |n| default-constructed renderbuffers and writes their ids to
+// |result|.
+void GenRenderbuffers(int n, GLuint* result) {
+  int created = 0;
+  while (created < n) {
+    Renderbuffer fresh;
+    result[created] = ctx->renderbuffers.insert(fresh);
+    created++;
+  }
+}
+
+// Called when the renderbuffer is erased: detaches its backing texture from
+// every framebuffer (resetting the layer when it was the color attachment)
+// and then deletes that texture.
+void Renderbuffer::on_erase() {
+  for (auto* fb : ctx->framebuffers) {
+    if (!fb) {
+      continue;
+    }
+    const bool was_color = unlink(fb->color_attachment, texture);
+    if (was_color) {
+      fb->layer = 0;
+    }
+    unlink(fb->depth_attachment, texture);
+  }
+  DeleteTexture(texture);
+}
+
+// Deletes renderbuffer |n|; if it was erased, also clears the current
+// renderbuffer binding when it referred to |n|.
+void DeleteRenderbuffer(GLuint n) {
+  if (!n || !ctx->renderbuffers.erase(n)) {
+    return;
+  }
+  unlink(ctx->renderbuffer_binding, n);
+}
+
+// Creates |n| default-constructed framebuffers and writes their ids to
+// |result|.
+void GenFramebuffers(int n, GLuint* result) {
+  int created = 0;
+  while (created < n) {
+    Framebuffer fresh;
+    result[created] = ctx->framebuffers.insert(fresh);
+    created++;
+  }
+}
+
+// Deletes framebuffer |n|; if it was erased, also clears the read and draw
+// framebuffer bindings that referred to it.
+void DeleteFramebuffer(GLuint n) {
+  if (!n || !ctx->framebuffers.erase(n)) {
+    return;
+  }
+  unlink(ctx->read_framebuffer_binding, n);
+  unlink(ctx->draw_framebuffer_binding, n);
+}
+
+// Allocates storage for the renderbuffer bound to |target|. A renderbuffer is
+// backed by an ordinary texture, created lazily on first use. The unsized,
+// 24-bit and 32-bit depth formats are all stored as 16-bit depth.
+void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width,
+                         GLsizei height) {
+  Renderbuffer& rb = ctx->renderbuffers[ctx->get_binding(target)];
+  if (!rb.texture) {
+    GenTextures(1, &rb.texture);
+  }
+  const bool wide_depth = internal_format == GL_DEPTH_COMPONENT ||
+                          internal_format == GL_DEPTH_COMPONENT24 ||
+                          internal_format == GL_DEPTH_COMPONENT32;
+  if (wide_depth) {
+    // Force depth format to 16 bits.
+    internal_format = GL_DEPTH_COMPONENT16;
+  }
+  Texture& backing = ctx->textures[rb.texture];
+  set_tex_storage(backing, internal_format, width, height);
+}
+
+// Describes attribute |index| of the current vertex array: component type,
+// total byte size (component count times bytes per component), normalization,
+// stride and offset. The attribute sources from the currently bound array
+// buffer. Out-of-range indices assert and are ignored.
+void VertexAttribPointer(GLuint index, GLint size, GLenum type, bool normalized,
+                         GLsizei stride, GLuint offset) {
+  VertexArray& vao = ctx->vertex_arrays[ctx->current_vertex_array];
+  if (index >= NULL_ATTRIB) {
+    assert(0);
+    return;
+  }
+  VertexAttrib& attrib = vao.attribs[index];
+  // Data layout.
+  attrib.type = type;
+  attrib.size = size * bytes_per_type(type);
+  attrib.normalized = normalized;
+  attrib.stride = stride;
+  attrib.offset = offset;
+  // Where the data comes from.
+  attrib.vertex_buffer = ctx->array_buffer_binding;
+  attrib.vertex_array = ctx->current_vertex_array;
+  // Force revalidation of the vertex array before the next draw.
+  ctx->validate_vertex_array = true;
+}
+
+// Integer variant of VertexAttribPointer: describes attribute |index| of the
+// current vertex array with normalization always disabled. Out-of-range
+// indices assert and are ignored.
+void VertexAttribIPointer(GLuint index, GLint size, GLenum type, GLsizei stride,
+                          GLuint offset) {
+  VertexArray& vao = ctx->vertex_arrays[ctx->current_vertex_array];
+  if (index >= NULL_ATTRIB) {
+    assert(0);
+    return;
+  }
+  VertexAttrib& attrib = vao.attribs[index];
+  // Data layout.
+  attrib.type = type;
+  attrib.size = size * bytes_per_type(type);
+  attrib.normalized = false;
+  attrib.stride = stride;
+  attrib.offset = offset;
+  // Where the data comes from.
+  attrib.vertex_buffer = ctx->array_buffer_binding;
+  attrib.vertex_array = ctx->current_vertex_array;
+  // Force revalidation of the vertex array before the next draw.
+  ctx->validate_vertex_array = true;
+}
+
+// Enables attribute |index| on the current vertex array, requesting
+// revalidation if it was previously disabled, and raises the array's highest
+// tracked attribute index if needed.
+void EnableVertexAttribArray(GLuint index) {
+  VertexArray& vao = ctx->vertex_arrays[ctx->current_vertex_array];
+  if (index >= NULL_ATTRIB) {
+    assert(0);
+    return;
+  }
+  VertexAttrib& attrib = vao.attribs[index];
+  const bool was_enabled = attrib.enabled;
+  attrib.enabled = true;
+  if (!was_enabled) {
+    ctx->validate_vertex_array = true;
+  }
+  vao.max_attrib = max(vao.max_attrib, (int)index);
+}
+
+// Disables attribute |index| on the current vertex array, requesting
+// revalidation if it was previously enabled.
+void DisableVertexAttribArray(GLuint index) {
+  VertexArray& vao = ctx->vertex_arrays[ctx->current_vertex_array];
+  if (index >= NULL_ATTRIB) {
+    assert(0);
+    return;
+  }
+  VertexAttrib& attrib = vao.attribs[index];
+  const bool was_enabled = attrib.enabled;
+  attrib.enabled = false;
+  if (was_enabled) {
+    ctx->validate_vertex_array = true;
+  }
+}
+
+// Sets the instancing divisor of attribute |index| on the current vertex
+// array. Only 0 (per-vertex) and 1 (per-instance) are supported; anything
+// else, or an out-of-range index, asserts and is ignored.
+void VertexAttribDivisor(GLuint index, GLuint divisor) {
+  VertexArray& vao = ctx->vertex_arrays[ctx->current_vertex_array];
+  const bool valid = index < NULL_ATTRIB && divisor <= 1;
+  if (!valid) {
+    assert(0);
+    return;
+  }
+  vao.attribs[index].divisor = divisor;
+}
+
+// (Re)allocates the buffer bound to |target| to |size| bytes and, when |data|
+// is provided, copies |size| bytes into it. If the allocation reports a
+// change, the vertex array is flagged for revalidation.
+void BufferData(GLenum target, GLsizeiptr size, void* data,
+                UNUSED GLenum usage) {
+  Buffer& buffer = ctx->buffers[ctx->get_binding(target)];
+  if (buffer.allocate(size)) {
+    ctx->validate_vertex_array = true;
+  }
+  // Nothing to copy, or nowhere safe to copy it to.
+  if (!data || !buffer.buf || !(size <= buffer.size)) {
+    return;
+  }
+  memcpy(buffer.buf, data, size);
+}
+
+// Copies |size| bytes from |data| into the buffer bound to |target|, starting
+// at byte |offset|. The update is silently dropped when |data| is null, the
+// buffer has no storage, or the range does not lie fully inside the buffer.
+//
+// Negative offsets and sizes are rejected explicitly: a negative |offset|
+// paired with a larger positive |size| would otherwise satisfy the
+// "offset + size <= b.size" check and write before the start of the buffer.
+void BufferSubData(GLenum target, GLintptr offset, GLsizeiptr size,
+                   void* data) {
+  Buffer& b = ctx->buffers[ctx->get_binding(target)];
+  assert(offset >= 0 && size >= 0);
+  assert(offset + size <= b.size);
+  if (data && b.buf && offset >= 0 && size >= 0 && offset + size <= b.size) {
+    memcpy(&b.buf[offset], data, size);
+  }
+}
+
+// Returns a pointer to the storage of the buffer bound to |target|. The
+// access flags are ignored.
+void* MapBuffer(GLenum target, UNUSED GLbitfield access) {
+  auto bound = ctx->get_binding(target);
+  return ctx->buffers[bound].buf;
+}
+
+// Returns a pointer |offset| bytes into the buffer bound to |target|, or
+// nullptr if the buffer has no storage, the offset is negative, the length is
+// not positive, or the range extends past the end of the buffer. The access
+// flags are ignored.
+void* MapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length,
+                     UNUSED GLbitfield access) {
+  Buffer& buffer = ctx->buffers[ctx->get_binding(target)];
+  if (!buffer.buf || offset < 0 || length <= 0 ||
+      !(offset + length <= buffer.size)) {
+    return nullptr;
+  }
+  return buffer.buf + offset;
+}
+
+// Nothing needs to be released on unmap; reports whether the buffer bound to
+// |target| has storage.
+GLboolean UnmapBuffer(GLenum target) {
+  auto bound = ctx->get_binding(target);
+  const Buffer& buffer = ctx->buffers[bound];
+  return buffer.buf != nullptr;
+}
+
+// Forwards an integer uniform to the current vertex shader, if there is one.
+void Uniform1i(GLint location, GLint V0) {
+  if (!vertex_shader) {
+    return;
+  }
+  vertex_shader->set_uniform_1i(location, V0);
+}
+// Forwards a single 4-float uniform to the current vertex shader, if there is
+// one. Arrays are not supported, so |count| must be 1.
+void Uniform4fv(GLint location, GLsizei count, const GLfloat* v) {
+  assert(count == 1);
+  if (!vertex_shader) {
+    return;
+  }
+  vertex_shader->set_uniform_4fv(location, v);
+}
+// Forwards a single 4x4 matrix uniform to the current vertex shader, if there
+// is one. Arrays and transposition are not supported.
+void UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose,
+                      const GLfloat* value) {
+  assert(count == 1);
+  assert(!transpose);
+  if (!vertex_shader) {
+    return;
+  }
+  vertex_shader->set_uniform_matrix4fv(location, value);
+}
+
+// Attaches level 0 of a 2D or rectangle texture to the framebuffer bound to
+// |target|. Only the first color attachment and the depth attachment are
+// supported; attaching a color texture resets the framebuffer layer to 0.
+void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget,
+                          GLuint texture, GLint level) {
+  assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER);
+  assert(textarget == GL_TEXTURE_2D || textarget == GL_TEXTURE_RECTANGLE);
+  assert(level == 0);
+  Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)];
+  switch (attachment) {
+    case GL_COLOR_ATTACHMENT0:
+      fb.color_attachment = texture;
+      fb.layer = 0;
+      break;
+    case GL_DEPTH_ATTACHMENT:
+      fb.depth_attachment = texture;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+// Attaches one layer of level 0 of a texture to the framebuffer bound to
+// |target|. For the color attachment the layer is recorded on the
+// framebuffer; the depth attachment only supports layer 0.
+void FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture,
+                             GLint level, GLint layer) {
+  assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER);
+  assert(level == 0);
+  Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)];
+  switch (attachment) {
+    case GL_COLOR_ATTACHMENT0:
+      fb.color_attachment = texture;
+      fb.layer = layer;
+      break;
+    case GL_DEPTH_ATTACHMENT:
+      assert(layer == 0);
+      fb.depth_attachment = texture;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+// Attaches a renderbuffer to the framebuffer bound to |target| by attaching
+// the texture that backs the renderbuffer. Attaching as color resets the
+// framebuffer layer to 0.
+void FramebufferRenderbuffer(GLenum target, GLenum attachment,
+                             GLenum renderbuffertarget, GLuint renderbuffer) {
+  assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER);
+  assert(renderbuffertarget == GL_RENDERBUFFER);
+  Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)];
+  Renderbuffer& rb = ctx->renderbuffers[renderbuffer];
+  switch (attachment) {
+    case GL_COLOR_ATTACHMENT0:
+      fb.color_attachment = rb.texture;
+      fb.layer = 0;
+      break;
+    case GL_DEPTH_ATTACHMENT:
+      fb.depth_attachment = rb.texture;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+} // extern "C"
+
+// Looks up the framebuffer bound to |target|, treating GL_FRAMEBUFFER as the
+// draw framebuffer. Returns whatever the framebuffer store's find() yields
+// for that binding.
+static inline Framebuffer* get_framebuffer(GLenum target) {
+  const GLenum resolved =
+      target == GL_FRAMEBUFFER ? GL_DRAW_FRAMEBUFFER : target;
+  auto bound = ctx->get_binding(resolved);
+  return ctx->framebuffers.find(bound);
+}
+
+// Writes |val| into the first |n| elements starting at |dst|.
+template <typename T>
+static inline void fill_n(T* dst, size_t n, T val) {
+  for (size_t i = 0; i < n; i++) {
+    dst[i] = val;
+  }
+}
+
+#if USE_SSE2
+// Specialization of fill_n for 32-bit values, compiled only when USE_SSE2 is
+// set. It issues a single "rep stosl" string-store instruction instead of a
+// C++ loop.
+template <>
+inline void fill_n<uint32_t>(uint32_t* dst, size_t n, uint32_t val) {
+ // Operand constraints: "+D" and "+c" pass dst and n as read-write operands
+ // in the destination-index and count registers, and "a" passes val in the
+ // accumulator register. The "memory" and "cc" clobbers tell the compiler
+ // that memory and the condition codes may be modified.
+ __asm__ __volatile__("rep stosl\n"
+ : "+D"(dst), "+c"(n)
+ : "a"(val)
+ : "memory", "cc");
+}
+#endif
+
+// clear_chunk replicates a clear value across a 32-bit word so that several
+// narrow pixels can be written with one 32-bit store.
+
+// Four copies of an 8-bit value.
+static inline uint32_t clear_chunk(uint8_t value) {
+  const uint32_t v = value;
+  return v | (v << 8) | (v << 16) | (v << 24);
+}
+
+// Two copies of a 16-bit value.
+static inline uint32_t clear_chunk(uint16_t value) {
+  const uint32_t v = value;
+  return (v << 16) | v;
+}
+
+// A 32-bit value already fills the word.
+static inline uint32_t clear_chunk(uint32_t value) {
+  return value;
+}
+
+// Fills |len| elements at |buf| with |value|. |chunk| must hold |value|
+// replicated across 32 bits (see clear_chunk). For element types narrower
+// than 32 bits, leading elements are written one at a time until |buf| is
+// 4-byte aligned, the bulk is written as 32-bit words, and any leftover
+// elements are written one at a time.
+template <typename T>
+static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) {
+  const size_t per_word = sizeof(uint32_t) / sizeof(T);
+  if (per_word > 1) {
+    // Number of elements before the next 4-byte boundary.
+    size_t lead = (-(intptr_t)buf & (sizeof(uint32_t) - 1)) / sizeof(T);
+    if (lead <= len) {
+      fill_n(buf, lead, value);
+      buf += lead;
+      len -= lead;
+    }
+  }
+  // Bulk of the row, one 32-bit word at a time.
+  const size_t words = len / per_word;
+  fill_n((uint32_t*)buf, words, chunk);
+  if (per_word > 1) {
+    // Elements that did not fill a whole word.
+    const size_t tail = len & (per_word - 1);
+    fill_n(buf + (len - tail), tail, value);
+  }
+}
+
+// Fills the rectangle |bb| of layer |layer| in texture |t| with |value|,
+// leaving the column range [skip_start, skip_end) of every row untouched.
+// With the default skip arguments nothing is skipped.
+template <typename T>
+static void clear_buffer(Texture& t, T value, int layer, IntRect bb,
+                         int skip_start = 0, int skip_end = 0) {
+  if (!t.buf) return;
+  // Normalize the skip range so that bb.x0 <= skip_start <= skip_end.
+  skip_start = max(skip_start, bb.x0);
+  skip_end = max(skip_end, skip_start);
+  assert(sizeof(T) == t.bpp());
+  size_t stride = t.stride();
+  const bool nothing_skipped = skip_start >= skip_end;
+  // Several full-width rows with no skipped columns are merged into one long
+  // "row" so the per-row setup in clear_row only happens once.
+  if (bb.width() == t.width && bb.height() > 1 && nothing_skipped) {
+    bb.x1 += (stride / sizeof(T)) * (bb.height() - 1);
+    bb.y1 = bb.y0 + 1;
+  }
+  const uint32_t chunk = clear_chunk(value);
+  const bool has_left = bb.x0 < skip_start;
+  const bool has_right = skip_end < bb.x1;
+  T* row = (T*)t.sample_ptr(bb.x0, bb.y0, layer);
+  for (int remaining = bb.height(); remaining > 0; remaining--) {
+    if (has_left) {
+      // Columns before the skipped range.
+      clear_row(row, skip_start - bb.x0, value, chunk);
+    }
+    if (has_right) {
+      // Columns after the skipped range.
+      clear_row(row + (skip_end - bb.x0), bb.x1 - skip_end, value, chunk);
+    }
+    row += stride / sizeof(T);
+  }
+}
+
+// Fills the part of layer |layer| of texture |t| selected by the context's
+// scissor with |value|. Does nothing if that region has no width.
+template <typename T>
+static inline void clear_buffer(Texture& t, T value, int layer = 0) {
+  IntRect region = ctx->apply_scissor(t);
+  if (region.width() <= 0) {
+    return;
+  }
+  clear_buffer<T>(t, value, layer, region);
+}
+
+// Writes the texture's stored clear value into row |y| of |t|, leaving the
+// column range [skip_start, skip_end) untouched. The texture must have
+// storage and a pixel size matching T.
+template <typename T>
+static inline void force_clear_row(Texture& t, int y, int skip_start = 0,
+                                   int skip_end = 0) {
+  assert(t.buf != nullptr);
+  assert(sizeof(T) == t.bpp());
+  assert(skip_start <= skip_end);
+  T* row = (T*)t.sample_ptr(0, y);
+  const uint32_t chunk = clear_chunk((T)t.clear_val);
+  const bool has_left = skip_start > 0;
+  const bool has_right = skip_end < t.width;
+  if (has_left) {
+    clear_row<T>(row, skip_start, t.clear_val, chunk);
+  }
+  if (has_right) {
+    clear_row<T>(row + skip_end, t.width - skip_end, t.clear_val, chunk);
+  }
+}
+
+// Resolves a pending delayed clear on |t| by writing t.clear_val into rows
+// that have not been cleared yet. t.cleared_rows is scanned as an array of
+// 32-bit masks, one bit per row: a zero bit marks a row that still needs the
+// clear value, a set bit marks a row that is skipped.
+//
+// If |skip| is given, only the mask words overlapping the skip rect's rows
+// are processed, and the pixels inside the skip rect are left untouched
+// because the caller is about to overwrite them. If the skip rect covers the
+// whole texture, the delayed clear is simply dropped.
+//
+// Whole mask words are processed at a time, so rows above or below the skip
+// rect can be cleared here too. Those rows are marked as cleared, so they
+// must receive the clear value across their full width; the skipped columns
+// are therefore only honored on rows inside [skip->y0, skip->y1). Applying
+// the column skip to every processed row would leave stale pixels in rows
+// that the caller never overwrites.
+template <typename T>
+static void force_clear(Texture& t, const IntRect* skip = nullptr) {
+  if (!t.delay_clear || !t.cleared_rows) {
+    return;
+  }
+  int y0 = 0;
+  int y1 = t.height;
+  int skip_start = 0;
+  int skip_end = 0;
+  if (skip) {
+    y0 = clamp(skip->y0, 0, t.height);
+    y1 = clamp(skip->y1, y0, t.height);
+    skip_start = clamp(skip->x0, 0, t.width);
+    skip_end = clamp(skip->x1, skip_start, t.width);
+    if (skip_start <= 0 && skip_end >= t.width && y0 <= 0 && y1 >= t.height) {
+      t.disable_delayed_clear();
+      return;
+    }
+  }
+  // Clears rows [start, start + count). Rows outside [y0, y1) are cleared
+  // across their full width; rows inside keep the skipped columns untouched.
+  auto clear_rows = [&](int start, int count) {
+    int end = start + count;
+    int inner0 = clamp(y0, start, end);
+    int inner1 = clamp(y1, inner0, end);
+    if (start < inner0) {
+      clear_buffer<T>(t, t.clear_val, 0, IntRect{0, start, t.width, inner0});
+    }
+    if (inner0 < inner1) {
+      clear_buffer<T>(t, t.clear_val, 0, IntRect{0, inner0, t.width, inner1},
+                      skip_start, skip_end);
+    }
+    if (inner1 < end) {
+      clear_buffer<T>(t, t.clear_val, 0, IntRect{0, inner1, t.width, end});
+    }
+  };
+  int num_masks = (y1 + 31) / 32;
+  uint32_t* rows = t.cleared_rows;
+  for (int i = y0 / 32; i < num_masks; i++) {
+    uint32_t mask = rows[i];
+    if (mask != ~0U) {
+      // Every row of this word is handled below, so mark them all now.
+      rows[i] = ~0U;
+      int start = i * 32;
+      while (mask) {
+        // Trailing zero bits: a run of rows that still need clearing.
+        int count = __builtin_ctz(mask);
+        if (count > 0) {
+          clear_rows(start, count);
+          t.delay_clear -= count;
+          start += count;
+          mask >>= count;
+        }
+        // Trailing one bits: a run of rows to step over. mask + 1 cannot be
+        // zero here because mask is never all ones at this point.
+        count = __builtin_ctz(mask + 1);
+        start += count;
+        mask >>= count;
+      }
+      // The remaining high bits were all zero: clear the rest of the word.
+      int count = (i + 1) * 32 - start;
+      if (count > 0) {
+        clear_rows(start, count);
+        t.delay_clear -= count;
+      }
+    }
+  }
+  if (t.delay_clear <= 0) t.disable_delayed_clear();
+}
+
+// Makes the contents of |t| valid before it is read or partially written by
+// resolving any pending delayed clear, dispatching on the texture's pixel
+// format. |skip|, if given, is passed on to force_clear as the rectangle
+// that does not need clearing.
+static void prepare_texture(Texture& t, const IntRect* skip) {
+  if (!t.delay_clear) {
+    return;
+  }
+  switch (t.internal_format) {
+    case GL_R8:
+      force_clear<uint8_t>(t, skip);
+      break;
+    case GL_RG8:
+      force_clear<uint16_t>(t, skip);
+      break;
+    case GL_RGBA8:
+      force_clear<uint32_t>(t, skip);
+      break;
+    default:
+      assert(false);
+      break;
+  }
+}
+
+// True when scissoring is enabled and the scissor rect does not contain the
+// texture's offset bounds, i.e. a clear cannot simply cover the whole texture.
+static inline bool clear_requires_scissor(Texture& t) {
+  if (!ctx->scissortest) {
+    return false;
+  }
+  return !ctx->scissor.contains(t.offset_bounds());
+}
+
+// Requests a clear of texture |t| to |value|. Depending on the state this is
+// performed right away or recorded as a delayed clear to be resolved later.
+template <typename T>
+static void request_clear(Texture& t, int layer, T value) {
+  if (clear_requires_scissor(t)) {
+    // Scissored clear: first resolve any delayed clear outside the scissor
+    // rect, then clear the scissored region immediately.
+    IntRect scissored = ctx->scissor - t.offset;
+    force_clear<T>(t, &scissored);
+    clear_buffer<T>(t, value, layer);
+    return;
+  }
+  if (t.depth > 1) {
+    // Texture arrays do not support delayed clears, so clear right away.
+    t.disable_delayed_clear();
+    clear_buffer<T>(t, value, layer);
+    return;
+  }
+  // Unscissored 2D texture: just record the clear for later.
+  t.enable_delayed_clear(value);
+}
+
+// Resets a depth texture so that every row starts with one run of |depth|
+// spanning the full width, then marks the texture as cleared.
+void Texture::init_depth_runs(uint16_t depth) {
+  if (!buf) return;
+  DepthRun* row = (DepthRun*)buf;
+  int y = 0;
+  while (y < height) {
+    row[0] = DepthRun(depth, width);
+    row += stride() / sizeof(DepthRun);
+    y++;
+  }
+  set_cleared(true);
+}
+
+// Writes |n| flattened depth samples of value |depth| starting at |dst|, one
+// 32-bit word per sample.
+static ALWAYS_INLINE void fill_depth_run(DepthRun* dst, size_t n,
+                                         uint16_t depth) {
+  uint32_t* samples = (uint32_t*)dst;
+  const uint32_t sample = uint32_t(depth);
+  fill_n(samples, n, sample);
+}
+
+// Fills the scissored region of an already-cleared depth texture with
+// |depth|. Each affected row is handled according to its current form:
+//  - the scissor spans the full row: reset it to a single full-width run;
+//  - the row is flattened: overwrite the samples directly;
+//  - the row still uses runs: fill through a DepthCursor.
+//
+// An empty scissor region is rejected up front, matching the scissored
+// clear_buffer. Without the guard, a non-positive region width would be
+// converted to a huge unsigned count when filling a flattened row.
+void Texture::fill_depth_runs(uint16_t depth) {
+  if (!buf) return;
+  assert(cleared());
+  IntRect bb = ctx->apply_scissor(*this);
+  if (bb.width() <= 0 || bb.height() <= 0) {
+    return;
+  }
+  DepthRun* runs = (DepthRun*)sample_ptr(0, bb.y0);
+  for (int rows = bb.height(); rows > 0; rows--) {
+    if (bb.width() >= width) {
+      // If the scissor region encompasses the entire row, reset the row to a
+      // single run encompassing the entire row.
+      runs[0] = DepthRun(depth, width);
+    } else if (runs->is_flat()) {
+      // If the row is flattened, just directly fill the portion of the row.
+      fill_depth_run(&runs[bb.x0], bb.width(), depth);
+    } else {
+      // Otherwise, if we are still using runs, then set up a cursor to fill
+      // it with depth runs.
+      DepthCursor(runs, width, bb.x0, bb.width()).fill(depth);
+    }
+    runs += stride() / sizeof(DepthRun);
+  }
+}
+
+extern "C" {
+
+// Sets up framebuffer 0 with an RGBA8 color texture and a 16-bit depth
+// texture of the given size, creating either texture on first use. |buf| and
+// |stride| are passed through as the color texture's buffer. Both textures
+// are positioned at offset (x, y).
+void InitDefaultFramebuffer(int x, int y, int width, int height, int stride,
+                            void* buf) {
+  Framebuffer& fb = ctx->framebuffers[0];
+  const IntPoint origin = IntPoint(x, y);
+  // Color buffer. Its storage is set up before any further texture is
+  // created below.
+  if (!fb.color_attachment) {
+    GenTextures(1, &fb.color_attachment);
+    fb.layer = 0;
+  }
+  {
+    Texture& color = ctx->textures[fb.color_attachment];
+    set_tex_storage(color, GL_RGBA8, width, height, buf, stride);
+    color.offset = origin;
+  }
+  // Depth buffer, kept at the same dimensions as the color buffer.
+  if (!fb.depth_attachment) {
+    GenTextures(1, &fb.depth_attachment);
+  }
+  {
+    Texture& depth = ctx->textures[fb.depth_attachment];
+    set_tex_storage(depth, GL_DEPTH_COMPONENT16, width, height);
+    depth.offset = origin;
+  }
+}
+
+// Returns a pointer to the pixels of the color attachment of framebuffer
+// |fbo| at the framebuffer's current layer, writing its dimensions and
+// stride to the out-parameters. When |flush| is set, pending delayed clears
+// are resolved first. Returns nullptr if the framebuffer or attachment does
+// not exist, or if the texture has no storage (the out-parameters are still
+// written in the last case).
+void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width,
+                     int32_t* height, int32_t* stride) {
+  Framebuffer* fb = ctx->framebuffers.find(fbo);
+  if (!fb) {
+    return nullptr;
+  }
+  if (!fb->color_attachment) {
+    return nullptr;
+  }
+  Texture& color = ctx->textures[fb->color_attachment];
+  if (flush) {
+    prepare_texture(color);
+  }
+  assert(color.offset == IntPoint(0, 0));
+  *width = color.width;
+  *height = color.height;
+  *stride = color.stride();
+  if (!color.buf) {
+    return nullptr;
+  }
+  return color.sample_ptr(0, 0, fb->layer);
+}
+
+// Sets the storage of texture |texid| to the given format and size, passing
+// |buf|, |stride| and the minimum dimensions through to set_tex_storage.
+void SetTextureBuffer(GLuint texid, GLenum internal_format, GLsizei width,
+                      GLsizei height, GLsizei stride, void* buf,
+                      GLsizei min_width, GLsizei min_height) {
+  Texture& tex = ctx->textures[texid];
+  set_tex_storage(tex, internal_format, width, height, buf, stride,
+                  min_width, min_height);
+}
+
+// Reports the framebuffer bound to |target| as complete when it exists and
+// has a color attachment, and as unsupported otherwise.
+GLenum CheckFramebufferStatus(GLenum target) {
+  Framebuffer* fb = get_framebuffer(target);
+  const bool usable = fb && fb->color_attachment;
+  return usable ? GL_FRAMEBUFFER_COMPLETE : GL_FRAMEBUFFER_UNSUPPORTED;
+}
+
+// Clears the color and/or depth attachment of the draw framebuffer, as
+// selected by |mask|.
+//
+// Color: the clear color is narrowed to the attachment's pixel format
+// (RGBA8, R8 or RG8) and handed to request_clear, which may defer it.
+// Depth: the clear depth is scaled to 16 bits. If the depth buffer is
+// already initialized and the clear is scissored, only the scissored runs
+// are filled; otherwise the whole buffer is re-initialized.
+//
+// get_framebuffer() returns a pointer that may be null, and the other callers
+// in this file check it before use. Do the same here instead of
+// dereferencing it unconditionally.
+void Clear(GLbitfield mask) {
+  Framebuffer* draw_fb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
+  if (!draw_fb) {
+    return;
+  }
+  Framebuffer& fb = *draw_fb;
+  if ((mask & GL_COLOR_BUFFER_BIT) && fb.color_attachment) {
+    Texture& t = ctx->textures[fb.color_attachment];
+    assert(!t.locked);
+    if (t.internal_format == GL_RGBA8) {
+      uint32_t color = ctx->clearcolor;
+      request_clear<uint32_t>(t, fb.layer, color);
+    } else if (t.internal_format == GL_R8) {
+      // Single channel: take bits 16..23 of the packed clear color.
+      uint8_t color = uint8_t((ctx->clearcolor >> 16) & 0xFF);
+      request_clear<uint8_t>(t, fb.layer, color);
+    } else if (t.internal_format == GL_RG8) {
+      // Two channels: bits 8..15 stay in place, bits 16..23 move to the low
+      // byte.
+      uint16_t color = uint16_t((ctx->clearcolor & 0xFF00) |
+                                ((ctx->clearcolor >> 16) & 0xFF));
+      request_clear<uint16_t>(t, fb.layer, color);
+    } else {
+      assert(false);
+    }
+  }
+  if ((mask & GL_DEPTH_BUFFER_BIT) && fb.depth_attachment) {
+    Texture& t = ctx->textures[fb.depth_attachment];
+    assert(t.internal_format == GL_DEPTH_COMPONENT16);
+    uint16_t depth = uint16_t(0xFFFF * ctx->cleardepth);
+    if (t.cleared() && clear_requires_scissor(t)) {
+      // If we need to scissor the clear and the depth buffer was already
+      // initialized, then just fill runs for that scissor area.
+      t.fill_depth_runs(depth);
+    } else {
+      // Otherwise, the buffer is either uninitialized or the clear would
+      // encompass the entire buffer. If uninitialized, we can safely fill
+      // the entire buffer with any value and thus ignore any scissoring.
+      t.init_depth_runs(depth);
+    }
+  }
+}
+
+// Marks the listed attachments of the framebuffer bound to |target| as having
+// undefined contents: the depth attachment is flagged as not cleared, and any
+// delayed clear pending on the color attachment is dropped. Other attachment
+// names are ignored.
+void InvalidateFramebuffer(GLenum target, GLsizei num_attachments,
+                           const GLenum* attachments) {
+  Framebuffer* fb = get_framebuffer(target);
+  if (!fb || num_attachments <= 0 || !attachments) {
+    return;
+  }
+  for (GLsizei i = 0; i < num_attachments; i++) {
+    const GLenum which = attachments[i];
+    if (which == GL_DEPTH_ATTACHMENT) {
+      Texture& depth = ctx->textures[fb->depth_attachment];
+      depth.set_cleared(false);
+    } else if (which == GL_COLOR_ATTACHMENT0) {
+      Texture& color = ctx->textures[fb->color_attachment];
+      color.disable_delayed_clear();
+    }
+  }
+}
+
+// Reads a width x height rectangle at (x, y) from the color attachment of the
+// read framebuffer into |data| (or into the bound pixel pack buffer, in which
+// case |data| is an offset). The requested (format, type) must map to the
+// texture's internal format. Destination rows are tightly packed at the
+// requested width, and the rectangle is clipped to the texture before
+// copying.
+void ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format,
+                GLenum type, void* data) {
+  void* target_mem = get_pixel_pack_buffer_data(data);
+  if (!target_mem) return;
+  Framebuffer* read_fb = get_framebuffer(GL_READ_FRAMEBUFFER);
+  if (!read_fb) return;
+  assert(format == GL_RED || format == GL_RGBA || format == GL_RGBA_INTEGER ||
+         format == GL_BGRA || format == GL_RG);
+  Texture& src = ctx->textures[read_fb->color_attachment];
+  if (!src.buf) return;
+  // Resolve any delayed clear so the pixels read back are valid.
+  prepare_texture(src);
+  // Translate from framebuffer coordinates to texture coordinates.
+  x -= src.offset.x;
+  y -= src.offset.y;
+  assert(x >= 0 && y >= 0);
+  assert(x + width <= src.width);
+  assert(y + height <= src.height);
+  if (internal_format_for_data(format, type) != src.internal_format) {
+    debugf("mismatched format for read pixels: %x vs %x\n", src.internal_format,
+           internal_format_for_data(format, type));
+    assert(false);
+    return;
+  }
+  // Only support readback conversions that are reversible
+  assert(!format_requires_conversion(format, src.internal_format) ||
+         bytes_for_internal_format(format) == src.bpp());
+  uint8_t* out = (uint8_t*)target_mem;
+  // The destination stride is fixed by the requested width, before clipping.
+  size_t out_stride = width * src.bpp();
+  // Clip vertically, skipping destination rows that fall above the texture.
+  if (y < 0) {
+    out += -y * out_stride;
+    height += y;
+    y = 0;
+  }
+  if (y + height > src.height) {
+    height = src.height - y;
+  }
+  // Clip horizontally, skipping destination pixels left of the texture.
+  if (x < 0) {
+    out += -x * src.bpp();
+    width += x;
+    x = 0;
+  }
+  if (x + width > src.width) {
+    width = src.width - x;
+  }
+  if (width <= 0 || height <= 0) {
+    return;
+  }
+  const uint8_t* in = (const uint8_t*)src.sample_ptr(x, y, read_fb->layer);
+  convert_copy(format, src.internal_format, out, out_stride, in, src.stride(),
+               width, height);
+}
+
+// Copies a srcWidth x srcHeight x srcDepth box of pixels from one texture to
+// another, row by row. A GL_RENDERBUFFER name on either side is first
+// replaced by the texture backing that renderbuffer. Pending delayed clears
+// are resolved on both textures, skipping the destination rectangle that is
+// about to be overwritten. Nothing is copied if either texture lacks storage.
+void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel,
+                      GLint srcX, GLint srcY, GLint srcZ, GLuint dstName,
+                      GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX,
+                      GLint dstY, GLint dstZ, GLsizei srcWidth,
+                      GLsizei srcHeight, GLsizei srcDepth) {
+  assert(srcLevel == 0 && dstLevel == 0);
+  if (srcTarget == GL_RENDERBUFFER) {
+    srcName = ctx->renderbuffers[srcName].texture;
+  }
+  if (dstTarget == GL_RENDERBUFFER) {
+    dstName = ctx->renderbuffers[dstName].texture;
+  }
+  Texture& from = ctx->textures[srcName];
+  if (!from.buf) return;
+  prepare_texture(from);
+  Texture& to = ctx->textures[dstName];
+  if (!to.buf) return;
+  assert(!to.locked);
+  IntRect written = {dstX, dstY, dstX + srcWidth, dstY + srcHeight};
+  prepare_texture(to, &written);
+  assert(from.internal_format == to.internal_format);
+  assert(srcWidth >= 0);
+  assert(srcHeight >= 0);
+  assert(srcDepth >= 0);
+  assert(srcX + srcWidth <= from.width);
+  assert(srcY + srcHeight <= from.height);
+  assert(srcZ + srcDepth <= max(from.depth, 1));
+  assert(dstX + srcWidth <= to.width);
+  assert(dstY + srcHeight <= to.height);
+  assert(dstZ + srcDepth <= max(to.depth, 1));
+  int bpp = from.bpp();
+  int from_stride = from.stride();
+  int to_stride = to.stride();
+  for (int layer = 0; layer < srcDepth; layer++) {
+    char* to_row = to.sample_ptr(dstX, dstY, dstZ + layer);
+    char* from_row = from.sample_ptr(srcX, srcY, srcZ + layer);
+    int rows_left = srcHeight;
+    while (rows_left > 0) {
+      memcpy(to_row, from_row, srcWidth * bpp);
+      to_row += to_stride;
+      from_row += from_stride;
+      rows_left--;
+    }
+  }
+}
+
+// Copies a width x height rectangle at (x, y) from the current layer of the
+// read framebuffer's color attachment into the texture bound to |target|, at
+// (xoffset, yoffset) of layer |zoffset|. Does nothing without a read
+// framebuffer.
+void CopyTexSubImage3D(GLenum target, UNUSED GLint level, GLint xoffset,
+                       GLint yoffset, GLint zoffset, GLint x, GLint y,
+                       GLsizei width, GLsizei height) {
+  assert(level == 0);
+  Framebuffer* read_fb = get_framebuffer(GL_READ_FRAMEBUFFER);
+  if (!read_fb) return;
+  const GLsizei one_layer = 1;
+  CopyImageSubData(read_fb->color_attachment, GL_TEXTURE_3D, 0, x, y,
+                   read_fb->layer, ctx->get_binding(target), GL_TEXTURE_3D, 0,
+                   xoffset, yoffset, zoffset, width, height, one_layer);
+}
+
+// Copies a width x height rectangle at (x, y) from the current layer of the
+// read framebuffer's color attachment into layer 0 of the texture bound to
+// |target|, at (xoffset, yoffset). Does nothing without a read framebuffer.
+void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset,
+                       GLint yoffset, GLint x, GLint y, GLsizei width,
+                       GLsizei height) {
+  assert(level == 0);
+  Framebuffer* read_fb = get_framebuffer(GL_READ_FRAMEBUFFER);
+  if (!read_fb) return;
+  const GLint dest_layer = 0;
+  const GLsizei one_layer = 1;
+  CopyImageSubData(read_fb->color_attachment, GL_TEXTURE_2D_ARRAY, 0, x, y,
+                   read_fb->layer, ctx->get_binding(target),
+                   GL_TEXTURE_2D_ARRAY, 0, xoffset, yoffset, dest_layer, width,
+                   height, one_layer);
+}
+
+} // extern "C"
+
+// Per-lane depth test result, stored as a vector of 32-bit integers. As
+// produced by check_depth, a lane is set when its sample failed the test.
+using ZMask = I32;
+
+// Converts a depth mask to the layout of the destination pixel type. The
+// pointer argument is unused and only selects the overload.
+
+// 32-bit pixels: the mask is reinterpreted bit-for-bit as packed RGBA8.
+static inline PackedRGBA8 convert_zmask(ZMask mask, uint32_t*) {
+ return bit_cast<PackedRGBA8>(mask);
+}
+
+// 8-bit pixels: each lane is converted to a WideR8 lane.
+static inline WideR8 convert_zmask(ZMask mask, uint8_t*) {
+ return CONVERT(mask, WideR8);
+}
+
+// zmask_code() condenses a ZMask into a scalar that can be compared against
+// ZMASK_NONE_PASSED (every lane failed) and ZMASK_ALL_PASSED (no lane
+// failed). The encoding differs per platform, so the constants do too.
+#if USE_SSE2
+# define ZMASK_NONE_PASSED 0xFFFF
+# define ZMASK_ALL_PASSED 0
+// SSE2: built with _mm_movemask_epi8, so a fully set mask yields 0xFFFF.
+static inline uint32_t zmask_code(ZMask mask) {
+ return _mm_movemask_epi8(mask);
+}
+#else
+# define ZMASK_NONE_PASSED 0xFFFFFFFFU
+# define ZMASK_ALL_PASSED 0
+// Generic: each lane is narrowed to a byte and the four bytes are
+// reinterpreted as one 32-bit word, so a fully set mask yields 0xFFFFFFFF.
+static inline uint32_t zmask_code(ZMask mask) {
+ return bit_cast<uint32_t>(CONVERT(mask, U8));
+}
+#endif
+
+// Interprets items in the depth buffer as sign-extended 32-bit depth values
+// instead of as runs. Returns a mask that signals which samples in the given
+// chunk passed or failed the depth test with given Z value.
+//
+// z:       incoming depth, converted to I32 for the comparison.
+// zbuf:    depth samples for this chunk; loaded unaligned, and written back
+//          for passing lanes when DISCARD is false and depth writes are on.
+// outmask: receives the mask; a set lane means that sample failed. It is only
+//          written when the function returns true.
+// span:    number of valid samples in the chunk; lanes at or beyond it are
+//          forced to fail.
+// Returns false when no sample passed, true otherwise.
+template <bool DISCARD, typename Z>
+static ALWAYS_INLINE bool check_depth(Z z, DepthRun* zbuf, ZMask& outmask,
+ int span = 4) {
+ // SSE2 does not support unsigned comparison. So ensure Z value is
+ // sign-extended to int32_t.
+ I32 src = I32(z);
+ I32 dest = unaligned_load<I32>(zbuf);
+ // Invert the depth test to check which pixels failed and should be discarded.
+ ZMask mask = ctx->depthfunc == GL_LEQUAL
+ ?
+ // GL_LEQUAL: Not(LessEqual) = Greater
+ ZMask(src > dest)
+ :
+ // GL_LESS: Not(Less) = GreaterEqual
+ ZMask(src >= dest);
+ // Mask off any unused lanes in the span.
+ mask |= ZMask(span) < ZMask{1, 2, 3, 4};
+ // Early out when every lane failed; nothing is written in that case.
+ if (zmask_code(mask) == ZMASK_NONE_PASSED) {
+ return false;
+ }
+ // Keep the stored depth in failing lanes and take the new depth in passing
+ // lanes. With DISCARD set this write is skipped here.
+ if (!DISCARD && ctx->depthmask) {
+ unaligned_store(zbuf, (mask & dest) | (~mask & src));
+ }
+ outmask = mask;
+ return true;
+}
+
+// Scales the fragment shader's gl_FragCoord.z by 0xFFFF and casts it to
+// integer lanes, i.e. the 16-bit depth range used by the depth buffer.
+static ALWAYS_INLINE I32 packDepth() {
+ return cast(fragment_shader->gl_FragCoord.z * 0xFFFF);
+}
+
+// Writes depth |z| into |zbuf| for every lane that is neither set in |mask|
+// nor flagged as discarded by the fragment shader; all other lanes keep their
+// stored depth. Does nothing when depth writes are disabled.
+template <typename Z>
+static ALWAYS_INLINE void discard_depth(Z z, DepthRun* zbuf, I32 mask) {
+  if (!ctx->depthmask) {
+    return;
+  }
+  I32 incoming = I32(z);
+  I32 stored = unaligned_load<I32>(zbuf);
+  // Lanes discarded by the shader must not update depth either.
+  mask |= fragment_shader->swgl_IsPixelDiscarded;
+  unaligned_store(zbuf, (mask & stored) | (~mask & incoming));
+}
+
+// Narrows two vectors of four 32-bit integers into a single HalfRGBA8, with
+// |a| supplying the low half and |b| the high half. The SSE2 and NEON paths
+// use pack/narrow intrinsics; the fallback combines the inputs and converts
+// lane by lane.
+static inline HalfRGBA8 packRGBA8(I32 a, I32 b) {
+#if USE_SSE2
+ return _mm_packs_epi32(a, b);
+#elif USE_NEON
+ return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
+#else
+ return CONVERT(combine(a, b), HalfRGBA8);
+#endif
+}
+
+// Converts four pixels held as planar float channels (one vector per
+// component) into interleaved pixel form in a WideRGBA8. Each channel is
+// first rounded to integers by round_pixel; the z/x and y/w channels are then
+// packed and zipped together so each pixel's components end up adjacent.
+// NOTE(review): the resulting per-pixel component order looks like z, y, x, w
+// (matching the scalar overload) - confirm against zipLow/zip2Low.
+static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) {
+ ivec4 i = round_pixel(v);
+ HalfRGBA8 xz = packRGBA8(i.z, i.x);
+ HalfRGBA8 yw = packRGBA8(i.y, i.w);
+ HalfRGBA8 xyzwl = zipLow(xz, yw);
+ HalfRGBA8 xyzwh = zipHigh(xz, yw);
+ HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
+ HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
+ return combine(lo, hi);
+}
+
+// Scalar-color overload: rounds a single color, with its components ordered
+// z, y, x, w, and replicates it so the returned WideRGBA8 holds the same
+// pixel four times.
+UNUSED static inline WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v) {
+ I32 i = round_pixel((Float){v.z, v.y, v.x, v.w});
+ HalfRGBA8 c = packRGBA8(i, i);
+ return combine(c, c);
+}
+
+// Packs the current fragment shader's gl_FragColor output.
+static inline WideRGBA8 pack_pixels_RGBA8() {
+  const auto& frag_color = fragment_shader->gl_FragColor;
+  return pack_pixels_RGBA8(frag_color);
+}
+
+// Load a partial span > 0 and < 4 pixels.
+// Only the first |span| pixels at |src| are read; the remaining lanes of the
+// result are zero. For span >= 2 the first two pixels are loaded as a pair
+// and the third is loaded only when span > 2; for span == 1 only the first
+// pixel is loaded. The four lanes are then reinterpreted as V.
+template <typename V, typename P>
+static ALWAYS_INLINE V partial_load_span(const P* src, int span) {
+ return bit_cast<V>(
+ (span >= 2
+ ? combine(unaligned_load<V2<P>>(src),
+ V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0})
+ : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
+}
+
+// Stores the first |span| pixels of |src| to |dst|, for a partial span of
+// more than 0 and fewer than 4 pixels. Memory beyond those pixels is not
+// written.
+template <typename V, typename P>
+static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
+  auto lanes = bit_cast<V4<P>>(src);
+  if (span < 2) {
+    // Single pixel.
+    unaligned_store(dst, lanes.x);
+    return;
+  }
+  // First two pixels as a pair, then the third if present.
+  unaligned_store(dst, lowHalf(lanes));
+  if (span > 2) {
+    unaligned_store(dst + 2, lanes.z);
+  }
+}
+
+// Loads up to four pixels from |src|: a full vector load when at least four
+// pixels remain in the span, otherwise a partial load of |span| pixels.
+template <typename V, typename P>
+static ALWAYS_INLINE V load_span(const P* src, int span) {
+  if (span < 4) {
+    return partial_load_span<V, P>(src, span);
+  }
+  return unaligned_load<V, P>(src);
+}
+
+// Stores up to four pixels to |dst|: a full vector store when at least four
+// pixels remain in the span, otherwise a partial store of |span| pixels.
+template <typename V, typename P>
+static ALWAYS_INLINE void store_span(P* dst, V src, int span) {
+  if (span < 4) {
+    partial_store_span<V, P>(dst, src, span);
+    return;
+  }
+  unaligned_store<V, P>(dst, src);
+}
+
+// Cheap approximation of (x * y) / 255, computed as (x * y + x) >> 8.
+template <typename T>
+static inline T muldiv255(T x, T y) {
+  auto product = x * y;
+  return (product + x) >> 8;
+}
+
+// Adds x and y byte by byte, so that a carry out of any byte is dropped
+// rather than spilling into the byte above it.
+//
+// This is for operands that hold a signed 8-bit quantity in the low byte of
+// a wider lane (such as uint16_t) whose high bits are zero. That situation
+// arises when muldiv255 is applied to signed operands: the multiply consumes
+// the full 16 bits and the final >> 8 may throw away the sign bit. Under
+// two's complement addition the lost sign does not matter: a negative value
+// behaves like a large unsigned one and merely produces one carry bit on
+// overflow. Adding per byte discards exactly that carry, yielding the wanted
+// low 8 bits while the high bits remain zero.
+template <typename T>
+static inline T addlow(T x, T y) {
+  using Bytes = VectorType<uint8_t, sizeof(T)>;
+  Bytes sum = bit_cast<Bytes>(x) + bit_cast<Bytes>(y);
+  return bit_cast<T>(sum);
+}
+
+// Broadcasts the fourth component of each of the four pixels in |c| across
+// all four components of that pixel (lanes 3, 7, 11 and 15 are each
+// replicated four times).
+static inline WideRGBA8 alphas(WideRGBA8 c) {
+ return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
+}
+
+// A pointer into the color buffer for the start of the span. Set by
+// ClipRect::init_span and read by load_clip_mask to recover how far into the
+// span a given color buffer pointer is.
+static void* swgl_SpanBuf = nullptr;
+// A pointer into the clip mask for the start of the span. Set together with
+// swgl_SpanBuf by ClipRect::init_span.
+static uint8_t* swgl_ClipMaskBuf = nullptr;
+
+// Shapes a chunk of clip mask values to match the destination buffer format.
+// The buffer pointer only selects the overload; it is never dereferenced.
+// For a uint8_t destination the mask is returned unchanged.
+static ALWAYS_INLINE WideR8 expand_clip_mask(uint8_t* buf, WideR8 mask) {
+  return mask;
+}
+// For a uint32_t destination the mask is zipped with itself twice, producing
+// a WideRGBA8 (presumably each mask value repeated over the four channels of
+// its pixel, assuming zip interleaves its arguments).
+static ALWAYS_INLINE WideRGBA8 expand_clip_mask(uint32_t* buf, WideR8 mask) {
+  WideRG8 doubled = zip(mask, mask);
+  WideRGBA8 quadrupled = zip(doubled, doubled);
+  return quadrupled;
+}
+
+// Fetches the clip mask values that correspond to the pixels at buf. The
+// distance, in pixels, between buf and the recorded start of the span
+// (swgl_SpanBuf) is applied to the recorded start of the clip mask span
+// (swgl_ClipMaskBuf) to find the matching mask bytes. The loaded mask is then
+// expanded to the layout of the destination format.
+template <typename P>
+static ALWAYS_INLINE auto load_clip_mask(P* buf, int span)
+    -> decltype(expand_clip_mask(buf, 0)) {
+  auto pixelOffset = buf - (P*)swgl_SpanBuf;
+  uint8_t* maskPtr = swgl_ClipMaskBuf + pixelOffset;
+  PackedR8 packedMask = load_span<PackedR8>(maskPtr, span);
+  return expand_clip_mask(buf, unpack(packedMask));
+}
+
+// Blends a chunk of wide RGBA8 source pixels against the packed destination
+// pixels pdst according to the current blend_key, and returns the blended
+// result in wide form. Nothing is written to buf here; buf is only used to
+// locate the matching clip mask values for the MASK_ variants of each blend
+// key, and span is the number of pixels to load from that mask.
+static inline WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
+ WideRGBA8 src, int span = 4) {
+ WideRGBA8 dst = unpack(pdst);
+ // Lane masks that select the three color lanes or the alpha lane of each of
+ // the four pixels, plus a constant holding 255 in every alpha lane.
+ const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF,
+ 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0,
+ 0xFFFF, 0xFFFF, 0xFFFF, 0};
+ const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF,
+ 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF};
+ const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255,
+ 0, 0, 0, 255, 0, 0, 0, 255};
+
+ // Each blend case is preceded by the MASK_ variant. The MASK_ case first
+ // loads the mask values and multiplies the source value by them. After, it
+ // falls through to the normal blending case using the masked source.
+#define BLEND_CASE_KEY(key) \
+ MASK_##key : src = muldiv255(src, load_clip_mask(buf, span)); \
+ FALLTHROUGH; \
+ case key
+#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))
+
+ switch (blend_key) {
+ case BLEND_CASE(GL_ONE, GL_ZERO):
+ // Source replaces destination.
+ return src;
+ case BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE,
+ GL_ONE_MINUS_SRC_ALPHA):
+ // dst + src.a*(src.rgb1 - dst)
+ // use addlow for signed overflow
+ return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst));
+ case BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
+ // src + dst*(1 - src.a)
+ return src + dst - muldiv255(dst, alphas(src));
+ case BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR):
+ // dst*(1 - src)
+ return dst - muldiv255(dst, src);
+ case BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE):
+ // dst*(1 - src) on the color lanes only; destination alpha is kept.
+ return dst - (muldiv255(dst, src) & RGB_MASK);
+ case BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA):
+ // dst*(1 - src.a)
+ return dst - muldiv255(dst, alphas(src));
+ case BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
+ // dst*src
+ return muldiv255(src, dst);
+ case BLEND_CASE(GL_ONE, GL_ONE):
+ // src + dst
+ return src + dst;
+ case BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
+ // Additive on the color lanes; alpha is src.a + dst.a*(1 - src.a).
+ return src + dst - (muldiv255(dst, src) & ALPHA_MASK);
+ case BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE):
+ // src*(1-dst.a) + dst*1 = src - src*dst.a + dst
+ return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK);
+ case BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR):
+ // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst)
+ // use addlow for signed overflow
+ return addlow(
+ dst, muldiv255(src, combine(ctx->blendcolor, ctx->blendcolor) - dst));
+ case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
+ // Dual-source blend: the destination is scaled by the fragment shader's
+ // secondary color output rather than by the source color.
+ WideRGBA8 secondary =
+ pack_pixels_RGBA8(fragment_shader->gl_SecondaryFragColor);
+ return src + dst - muldiv255(dst, secondary);
+ }
+ case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
+ // We must explicitly handle the masked secondary blend case.
+ // The secondary color as well as the source must be multiplied by the
+ // mask.
+ WideRGBA8 mask = load_clip_mask(buf, span);
+ WideRGBA8 secondary =
+ pack_pixels_RGBA8(fragment_shader->gl_SecondaryFragColor);
+ return muldiv255(src, mask) + dst -
+ muldiv255(dst, muldiv255(secondary, mask));
+ }
+ default:
+ // Any blend key without a case above is not expected to reach here.
+ UNREACHABLE;
+ // return src;
+ }
+
+#undef BLEND_CASE
+#undef BLEND_CASE_KEY
+}
+
+// Writes the fragment shader's RGBA8 output for SPAN pixels at buf, blending
+// against the destination first if a blend key is active. Wherever mask is
+// set the existing destination pixel is kept instead. With DISCARD enabled,
+// the shader's per-pixel discard flags are merged into the mask as well.
+template <bool DISCARD, int SPAN>
+static ALWAYS_INLINE void discard_output(uint32_t* buf, PackedRGBA8 mask) {
+  WideRGBA8 color = pack_pixels_RGBA8();
+  PackedRGBA8 existing = load_span<PackedRGBA8>(buf, SPAN);
+  if (blend_key) {
+    color = blend_pixels(buf, existing, color, SPAN);
+  }
+  if (DISCARD) {
+    mask |= bit_cast<PackedRGBA8>(fragment_shader->swgl_IsPixelDiscarded);
+  }
+  auto kept = mask & existing;
+  auto written = ~mask & pack(color);
+  store_span(buf, kept | written, SPAN);
+}
+
+// Writes the fragment shader's RGBA8 output for SPAN pixels at buf when there
+// is no externally supplied mask. Without DISCARD the (optionally blended)
+// color is stored directly; with DISCARD the shader's per-pixel discard flags
+// select, pixel by pixel, between the existing destination and the new color.
+template <bool DISCARD, int SPAN>
+static ALWAYS_INLINE void discard_output(uint32_t* buf) {
+  WideRGBA8 color = pack_pixels_RGBA8();
+  if (!DISCARD) {
+    if (blend_key) {
+      PackedRGBA8 existing = load_span<PackedRGBA8>(buf, SPAN);
+      color = blend_pixels(buf, existing, color, SPAN);
+    }
+    store_span(buf, pack(color), SPAN);
+    return;
+  }
+  PackedRGBA8 existing = load_span<PackedRGBA8>(buf, SPAN);
+  if (blend_key) {
+    color = blend_pixels(buf, existing, color, SPAN);
+  }
+  PackedRGBA8 discarded =
+      bit_cast<PackedRGBA8>(fragment_shader->swgl_IsPixelDiscarded);
+  store_span(buf, (discarded & existing) | (~discarded & pack(color)), SPAN);
+}
+
+// Narrows the 32-bit lanes of a into the WideR8 representation, using a
+// platform-specific narrowing instruction where one is available.
+static inline WideR8 packR8(I32 a) {
+#if USE_SSE2
+ // _mm_packs_epi32 narrows with signed saturation; both inputs are a, so
+ // only the low half of the packed result is needed.
+ return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a)));
+#elif USE_NEON
+ // vqmovun_s32 narrows signed 32-bit lanes to unsigned 16-bit lanes with
+ // saturation.
+ return vqmovun_s32(a);
+#else
+ // Portable fallback: plain lane-wise conversion.
+ return CONVERT(a, WideR8);
+#endif
+}
+
+// Converts a chunk of float channel values to WideR8 by rounding them with
+// round_pixel and narrowing the result with packR8.
+static inline WideR8 pack_pixels_R8(Float c) {
+  I32 rounded = round_pixel(c);
+  return packR8(rounded);
+}
+
+// Converts the x component of the fragment shader's current gl_FragColor.
+static inline WideR8 pack_pixels_R8() {
+  Float red = fragment_shader->gl_FragColor.x;
+  return pack_pixels_R8(red);
+}
+
+// Blends a chunk of wide R8 source values against the destination values dst
+// according to the current blend_key, and returns the blended result. Only
+// the blend keys listed in the switch are supported for this format. buf is
+// only used to locate the matching clip mask values for the MASK_ variants,
+// and span is the number of pixels to load from that mask.
+static inline WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src,
+ int span = 4) {
+ // As in the RGBA8 version, every case label is paired with a MASK_ label
+ // that multiplies src by the clip mask and then falls through.
+#define BLEND_CASE_KEY(key) \
+ MASK_##key : src = muldiv255(src, load_clip_mask(buf, span)); \
+ FALLTHROUGH; \
+ case key
+#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))
+
+ switch (blend_key) {
+ case BLEND_CASE(GL_ONE, GL_ZERO):
+ // Source replaces destination.
+ return src;
+ case BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
+ // dst*src
+ return muldiv255(src, dst);
+ case BLEND_CASE(GL_ONE, GL_ONE):
+ // src + dst
+ return src + dst;
+ default:
+ // Any other blend key is not expected to reach here.
+ UNREACHABLE;
+ // return src;
+ }
+
+#undef BLEND_CASE
+#undef BLEND_CASE_KEY
+}
+
+// Writes the fragment shader's R8 output for SPAN pixels at buf, blending
+// against the destination first if a blend key is active. Wherever mask is
+// set the existing destination value is kept instead. With DISCARD enabled,
+// the shader's per-pixel discard flags are merged into the mask as well.
+template <bool DISCARD, int SPAN>
+static inline void discard_output(uint8_t* buf, WideR8 mask) {
+  WideR8 value = pack_pixels_R8();
+  WideR8 existing = unpack(load_span<PackedR8>(buf, SPAN));
+  if (blend_key) {
+    value = blend_pixels(buf, existing, value, SPAN);
+  }
+  if (DISCARD) {
+    mask |= packR8(fragment_shader->swgl_IsPixelDiscarded);
+  }
+  auto merged = (mask & existing) | (~mask & value);
+  store_span(buf, pack(merged), SPAN);
+}
+
+// Writes the fragment shader's R8 output for SPAN pixels at buf when there is
+// no externally supplied mask. Without DISCARD the (optionally blended) value
+// is stored directly; with DISCARD the shader's per-pixel discard flags
+// select, pixel by pixel, between the existing destination and the new value.
+template <bool DISCARD, int SPAN>
+static inline void discard_output(uint8_t* buf) {
+  WideR8 value = pack_pixels_R8();
+  if (!DISCARD) {
+    if (blend_key) {
+      WideR8 existing = unpack(load_span<PackedR8>(buf, SPAN));
+      value = blend_pixels(buf, existing, value, SPAN);
+    }
+    store_span(buf, pack(value), SPAN);
+    return;
+  }
+  WideR8 existing = unpack(load_span<PackedR8>(buf, SPAN));
+  if (blend_key) {
+    value = blend_pixels(buf, existing, value, SPAN);
+  }
+  WideR8 discarded = packR8(fragment_shader->swgl_IsPixelDiscarded);
+  store_span(buf, pack((discarded & existing) | (~discarded & value)), SPAN);
+}
+
+// Runs the fragment shader for one full chunk and writes its output to buf,
+// keeping the destination wherever mask is set.
+template <bool DISCARD, bool W, typename P, typename M>
+static inline void commit_output(P* buf, M mask) {
+  constexpr int FULL_CHUNK = 4;
+  fragment_shader->run<W>();
+  discard_output<DISCARD, FULL_CHUNK>(buf, mask);
+}
+
+// Runs the fragment shader for one chunk and writes only a partial span of
+// its output to buf, keeping the destination wherever mask is set. A span of
+// 1 or 2 is written as such; any other value is written as 3 pixels.
+template <bool DISCARD, bool W, typename P, typename M>
+static inline void commit_output(P* buf, M mask, int span) {
+  fragment_shader->run<W>();
+  if (span == 1) {
+    discard_output<DISCARD, 1>(buf, mask);
+  } else if (span == 2) {
+    discard_output<DISCARD, 2>(buf, mask);
+  } else {
+    discard_output<DISCARD, 3>(buf, mask);
+  }
+}
+
+// Runs the fragment shader for one full chunk and writes its output to buf.
+template <bool DISCARD, bool W, typename P>
+static inline void commit_output(P* buf) {
+  constexpr int FULL_CHUNK = 4;
+  fragment_shader->run<W>();
+  discard_output<DISCARD, FULL_CHUNK>(buf);
+}
+
+// Runs the fragment shader for one chunk and writes only a partial span of
+// its output to buf. A span of 1 or 2 is written as such; any other value is
+// written as 3 pixels.
+template <bool DISCARD, bool W, typename P>
+static inline void commit_output(P* buf, int span) {
+  fragment_shader->run<W>();
+  if (span == 1) {
+    discard_output<DISCARD, 1>(buf);
+  } else if (span == 2) {
+    discard_output<DISCARD, 2>(buf);
+  } else {
+    discard_output<DISCARD, 3>(buf);
+  }
+}
+
+// Depth-tested variant for a full chunk. If check_depth reports failure, the
+// fragment shader is merely skipped past this chunk. Otherwise the chunk is
+// shaded and written using the depth mask converted to the color buffer's
+// format, and, when DISCARD is enabled, discard_depth is invoked afterwards
+// with the same depth mask.
+template <bool DISCARD, bool W, typename P, typename Z>
+static inline void commit_output(P* buf, Z z, DepthRun* zbuf) {
+  ZMask zmask;
+  if (!check_depth<DISCARD>(z, zbuf, zmask)) {
+    fragment_shader->skip<W>();
+    return;
+  }
+  commit_output<DISCARD, W>(buf, convert_zmask(zmask, buf));
+  if (DISCARD) {
+    discard_depth(z, zbuf, zmask);
+  }
+}
+
+// Depth-tested variant for a partial chunk of span pixels. Nothing is done
+// (not even a shader skip) if check_depth reports failure. Otherwise the
+// partial chunk is shaded and written using the depth mask converted to the
+// color buffer's format, and, when DISCARD is enabled, discard_depth is
+// invoked afterwards with the same depth mask.
+template <bool DISCARD, bool W, typename P, typename Z>
+static inline void commit_output(P* buf, Z z, DepthRun* zbuf, int span) {
+  ZMask zmask;
+  if (!check_depth<DISCARD>(z, zbuf, zmask, span)) {
+    return;
+  }
+  commit_output<DISCARD, W>(buf, convert_zmask(zmask, buf), span);
+  if (DISCARD) {
+    discard_depth(z, zbuf, zmask);
+  }
+}
+
+#include "composite.h"
+#include "swgl_ext.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+#ifdef __clang__
+# pragma GCC diagnostic ignored "-Wunused-private-field"
+#else
+# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+#include "load_shader.h"
+#pragma GCC diagnostic pop
+
+// Vertex position type taken by draw_quad_spans, which reads only x and y.
+typedef vec2_scalar Point2D;
+// Vertex position type taken by draw_perspective_spans, which additionally
+// reads z and w. Note this is a 4-component vector despite the name.
+typedef vec4_scalar Point3D;
+
+// Floating-point clip rectangle used to bound rasterization. Constructing one
+// from a Texture also resolves the blend state (blend_key) and, if a clip
+// mask is present, the clip mask state used for the primitive being drawn.
+struct ClipRect {
+ float x0;
+ float y0;
+ float x1;
+ float y1;
+
+ // Copies the bounds of an integer rectangle.
+ explicit ClipRect(const IntRect& i)
+ : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {}
+ // Starts from the rectangle returned by ctx->apply_scissor(t). Beyond
+ // initializing the bounds, this writes blend_key and may modify
+ // swgl_ClipMaskBounds and swgl_ClipMaskOffset, all of which are state
+ // declared outside this struct.
+ explicit ClipRect(const Texture& t) : ClipRect(ctx->apply_scissor(t)) {
+ // If blending is enabled, set blend_key to reflect the resolved blend
+ // state for the currently drawn primitive.
+ if (ctx->blend) {
+ blend_key = ctx->blend_key;
+ // If a clip mask is available, set up blending state to use the clip
+ // mask.
+ if (swgl_ClipMask) {
+ assert(swgl_ClipMask->format == TextureFormat::R8);
+ // Constrain the clip mask bounds to always fall within the clip mask.
+ swgl_ClipMaskBounds.intersect(IntRect{0, 0, int(swgl_ClipMask->width),
+ int(swgl_ClipMask->height)});
+ // The clip mask offset is relative to the viewport.
+ swgl_ClipMaskOffset += ctx->viewport.origin() - t.offset;
+ // The clip mask bounds are relative to the clip mask offset.
+ swgl_ClipMaskBounds.offset(swgl_ClipMaskOffset);
+ // Finally, constrain the clip rectangle by the clip mask bounds.
+ intersect(swgl_ClipMaskBounds);
+ // Modify the blend key so that it will use the clip mask while
+ // blending.
+ blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key);
+ }
+ } else {
+ blend_key = BLEND_KEY_NONE;
+ }
+ }
+
+ // Shrinks this rectangle to its intersection with c.
+ void intersect(const IntRect& c) {
+ x0 = max(x0, float(c.x0));
+ y0 = max(y0, float(c.y0));
+ x1 = min(x1, float(c.x1));
+ y1 = min(y1, float(c.y1));
+ }
+
+ // Records the start of a span for clip-masked blending. When the current
+ // blend key is one of the MASK_ variants, this stores the color buffer
+ // pointer for the span start and the address of the clip mask sample that
+ // corresponds to position (x, y). It does nothing otherwise.
+ template <typename P>
+ void init_span(int x, int y, P* buf) const {
+ if (blend_key >= MASK_BLEND_KEY_NONE) {
+ swgl_SpanBuf = buf;
+ swgl_ClipMaskBuf = (uint8_t*)swgl_ClipMask->buf +
+ (y - swgl_ClipMaskOffset.y) * swgl_ClipMask->stride +
+ (x - swgl_ClipMaskOffset.x);
+ }
+ }
+
+ // Returns whether the nump points in p might overlap this clip rect.
+ template <typename P>
+ bool overlaps(int nump, const P* p) const {
+ // Generate a mask of which side of the clip rect all of a polygon's points
+ // fall inside of. This is a cheap conservative estimate of whether the
+ // bounding box of the polygon might overlap the clip rect, rather than an
+ // exact test that would require multiple slower line intersections.
+ int sides = 0;
+ for (int i = 0; i < nump; i++) {
+ // Bit 1: some point lies left of x1. Bit 2: some point lies right of x0.
+ sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2;
+ // Bit 4: some point lies above y1. Bit 8: some point lies below y0.
+ sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8;
+ }
+ // All four bits set means the points are not all on the outer side of any
+ // single edge of the clip rect.
+ return sides == 0xF;
+ }
+};
+
+// Expands a row of depth runs, in place, into one depth sample per pixel.
+// Rows that are already flat are left untouched. Otherwise each run in turn
+// is filled with its own depth value until the whole width has been covered.
+static void flatten_depth_runs(DepthRun* runs, size_t width) {
+  if (!runs->is_flat()) {
+    for (size_t remaining = width; remaining > 0;) {
+      size_t runLength = runs->count;
+      fill_depth_run(runs, runLength, runs->depth);
+      runs += runLength;
+      remaining -= runLength;
+    }
+  }
+}
+
+// Helper function for drawing passed depth runs within the depth buffer.
+// Flattened depth (perspective or discard) is not supported.
+// Alternates between shading a stretch of pixels that pass the depth test for
+// the constant depth z and skipping a stretch that fails it, until the cursor
+// reports that nothing more passes. buf points at the first pixel to shade
+// and is advanced locally as the cursor moves along the row.
+template <typename P>
+static ALWAYS_INLINE void draw_depth_span(uint16_t z, P* buf,
+ DepthCursor& cursor) {
+ for (;;) {
+ // Get the span that passes the depth test. Assume on entry that
+ // any failed runs have already been skipped.
+ int span = cursor.check_passed(z, ctx->depthfunc, ctx->depthmask);
+ // If nothing passed, since we already skipped past failed runs
+ // previously, we must have hit the end of the row. Bail out.
+ if (span <= 0) {
+ break;
+ }
+ if (span >= 4) {
+ // If we have a draw specialization, try to process as many 4-pixel
+ // chunks as possible using it.
+ if (fragment_shader->has_draw_span(buf)) {
+ int len = span & ~3;
+ fragment_shader->draw_span(buf, len);
+ buf += len;
+ span &= 3;
+ } else {
+ // Otherwise, just process each chunk individually.
+ while (span >= 4) {
+ commit_output<false, false>(buf);
+ buf += 4;
+ span -= 4;
+ }
+ }
+ }
+ // If we have a partial chunk left over, we still have to process it as if
+ // it were a full chunk. Mask off only the part of the chunk we want to
+ // use.
+ if (span > 0) {
+ commit_output<false, false>(buf, span);
+ buf += span;
+ }
+ // Skip past any runs that fail the depth test.
+ int skip = cursor.skip_failed(z, ctx->depthfunc);
+ // If there aren't any, that means we won't encounter any more passing runs
+ // and so it's safe to bail out.
+ if (skip <= 0) {
+ break;
+ }
+ // Advance interpolants for the fragment shader past the skipped region.
+ // If we processed a partial chunk above, we actually advanced the
+ // interpolants a full chunk in the fragment shader's run function. Thus,
+ // we need to first subtract off that 4-pixel chunk and only partially
+ // advance them to that partial chunk before we can add on the rest of the
+ // skips. This is combined with the skip here for efficiency's sake.
+ fragment_shader->skip(skip - (span > 0 ? 4 - span : 0));
+ buf += skip;
+ }
+}
+
+// Shades a span of pixels starting at buf, four pixels at a time, with any
+// leftover pixels handled as one final partial chunk. If depth is non-null,
+// every chunk is depth tested against the value produced by calling z() and
+// the depth pointer advances in step with the color pointer. With
+// perspective, Z varies across the span and z() is expected to yield 16-bit
+// values derived from the current gl_FragCoord.z; without perspective it
+// simply returns the provided constant Z.
+template <bool DISCARD, bool W, typename P, typename Z>
+static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) {
+  if (!depth) {
+    // No depth testing: full chunks first, then the remainder.
+    while (span >= 4) {
+      commit_output<DISCARD, W>(buf);
+      span -= 4;
+      buf += 4;
+    }
+    if (span > 0) {
+      commit_output<DISCARD, W>(buf, span);
+    }
+    return;
+  }
+  // Depth testing enabled: full chunks first, then the remainder.
+  while (span >= 4) {
+    commit_output<DISCARD, W>(buf, z(), depth);
+    span -= 4;
+    buf += 4;
+    depth += 4;
+  }
+  if (span > 0) {
+    commit_output<DISCARD, W>(buf, z(), depth, span);
+  }
+}
+
+// Called during rasterization to forcefully clear a row on which delayed clear
+// has been enabled. If we know that we are going to completely overwrite a part
+// of the row, then we only need to clear the row outside of that part. However,
+// if blending or discard is enabled, the values of that underlying part of the
+// row may be used regardless to produce the final rasterization result, so we
+// have to then clear the entire underlying row to prepare it.
+//
+// colortex     color texture with delayed clear pending (delay_clear > 0)
+// y            row about to be drawn
+// startx/endx  horizontal extent of the span about to be drawn on that row
+// use_discard  whether the fragment shader may discard pixels
+// depth        depth row for this span, or null if depth is not in use
+// z, cursor    constant depth of the primitive and the depth cursor for the
+//              span; only consulted when the depth row is still in run form
+template <typename P>
+static inline void prepare_row(Texture& colortex, int y, int startx, int endx,
+                               bool use_discard, DepthRun* depth,
+                               uint16_t z = 0, DepthCursor* cursor = nullptr) {
+  assert(colortex.delay_clear > 0);
+  // Delayed clear is enabled for the color buffer. Check if needs clear.
+  // cleared_rows holds one bit per row, 32 rows to a word. The bit is built
+  // from an unsigned constant so that row 31 of a word never shifts a signed
+  // int into its sign bit.
+  uint32_t& mask = colortex.cleared_rows[y / 32];
+  const uint32_t rowBit = uint32_t(1) << (y & 31);
+  if ((mask & rowBit) == 0) {
+    mask |= rowBit;
+    colortex.delay_clear--;
+    if (blend_key || use_discard) {
+      // If blending or discard is used, old color values might be sampled,
+      // so we need to clear the entire row to fill it.
+      force_clear_row<P>(colortex, y);
+    } else if (depth) {
+      if (depth->is_flat() || !cursor) {
+        // If flat depth is used, we can't cheaply predict which samples will
+        // pass.
+        force_clear_row<P>(colortex, y);
+      } else {
+        // Otherwise if depth runs are used, see how many samples initially
+        // pass the depth test and only fill the row outside those. The
+        // fragment shader will fill the row within the passed samples. The
+        // check is done on a copy of the cursor so the caller's cursor is
+        // left where it was.
+        int passed =
+            DepthCursor(*cursor).check_passed<false>(z, ctx->depthfunc);
+        if (startx > 0 || startx + passed < colortex.width) {
+          force_clear_row<P>(colortex, y, startx, startx + passed);
+        }
+      }
+    } else if (startx > 0 || endx < colortex.width) {
+      // Otherwise, we only need to clear the row outside of the span.
+      // The fragment shader will fill the row within the span itself.
+      force_clear_row<P>(colortex, y, startx, endx);
+    }
+  }
+}
+
+// Draw spans for each row of a given quad (or triangle) with a constant Z
+// value. The quad is assumed convex. It is clipped to fall within the given
+// clip rect. In short, this function rasterizes a quad by first finding a
+// top most starting point and then from there tracing down the left and right
+// sides of this quad until it hits the bottom, outputting a span between the
+// current left and right positions at each row along the way. Points are
+// assumed to be ordered in either CW or CCW to support this, but currently
+// both orders (CW and CCW) are supported and equivalent.
+//
+// nump         number of points in p (3 or 4)
+// p            screen-space positions of the points
+// z            constant depth value used for the whole primitive
+// interp_outs  per-point interpolants, indexed the same way as p
+// colortex     destination color texture; layer selects the layer within it
+// depthtex     depth texture; depth is only used if it has a buffer and has
+//              been cleared
+// clipRect     rectangle that rasterization is clipped to
+template <typename P>
+static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z,
+ Interpolants interp_outs[4],
+ Texture& colortex, int layer,
+ Texture& depthtex,
+ const ClipRect& clipRect) {
+ // Only triangles and convex quads supported.
+ assert(nump == 3 || nump == 4);
+ Point2D l0, r0, l1, r1;
+ int l0i, r0i, l1i, r1i;
+ {
+ // Find the index of the top-most (smallest Y) point from which
+ // rasterization can start.
+ int top = nump > 3 && p[3].y < p[2].y
+ ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3)
+ : (p[1].y < p[3].y ? 1 : 3))
+ : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2)
+ : (p[1].y < p[2].y ? 1 : 2));
+ // Helper to find next index in the points array, walking forward.
+#define NEXT_POINT(idx) \
+ ({ \
+ int cur = (idx) + 1; \
+ cur < nump ? cur : 0; \
+ })
+ // Helper to find the previous index in the points array, walking backward.
+#define PREV_POINT(idx) \
+ ({ \
+ int cur = (idx)-1; \
+ cur >= 0 ? cur : nump - 1; \
+ })
+ // Start looking for "left"-side and "right"-side descending edges starting
+ // from the determined top point.
+ int next = NEXT_POINT(top);
+ int prev = PREV_POINT(top);
+ if (p[top].y == p[next].y) {
+ // If the next point is on the same row as the top, then advance one more
+ // time to the next point and use that as the "left" descending edge.
+ l0i = next;
+ l1i = NEXT_POINT(next);
+ // Assume top and prev form a descending "right" edge, as otherwise this
+ // will be a collapsed polygon and harmlessly bail out down below.
+ r0i = top;
+ r1i = prev;
+ } else if (p[top].y == p[prev].y) {
+ // If the prev point is on the same row as the top, then advance to the
+ // prev again and use that as the "right" descending edge.
+ // Assume top and next form a non-empty descending "left" edge.
+ l0i = top;
+ l1i = next;
+ r0i = prev;
+ r1i = PREV_POINT(prev);
+ } else {
+ // Both next and prev are on distinct rows from top, so both "left" and
+ // "right" edges are non-empty/descending.
+ l0i = r0i = top;
+ l1i = next;
+ r1i = prev;
+ }
+ // Load the points from the indices.
+ l0 = p[l0i]; // Start of left edge
+ r0 = p[r0i]; // Start of right edge
+ l1 = p[l1i]; // End of left edge
+ r1 = p[r1i]; // End of right edge
+ // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1:
+ // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i,
+ // r1.x, r1.y);
+ }
+
+ // Tracks the current X position and interpolant values along one edge of
+ // the polygon as rasterization steps down one row at a time.
+ struct Edge {
+ float yScale;
+ float xSlope;
+ float x;
+ Interpolants interpSlope;
+ Interpolants interp;
+
+ Edge(float y, const Point2D& p0, const Point2D& p1, const Interpolants& i0,
+ const Interpolants& i1)
+ : // Inverse Y scale for slope calculations. Avoid divide on 0-length
+ // edge. Later checks below ensure that Y <= p1.y, or otherwise we
+ // don't use this edge. We just need to guard against Y == p1.y ==
+ // p0.y. In that case, Y - p0.y == 0 and will cancel out the slopes
+ // below, except if yScale is Inf for some reason (or worse, NaN),
+ // which 1/(p1.y-p0.y) might produce if we don't bound it.
+ yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+ // Calculate dX/dY slope
+ xSlope((p1.x - p0.x) * yScale),
+ // Initialize current X based on Y and slope
+ x(p0.x + (y - p0.y) * xSlope),
+ // Calculate change in interpolants per change in Y
+ interpSlope((i1 - i0) * yScale),
+ // Initialize current interpolants based on Y and slope
+ interp(i0 + (y - p0.y) * interpSlope) {}
+
+ void nextRow() {
+ // step current X and interpolants to next row from slope
+ x += xSlope;
+ interp += interpSlope;
+ }
+ };
+
+ // Vertex selection above should result in equal left and right start rows
+ assert(l0.y == r0.y);
+ // Find the start y, clip to within the clip rect, and round to row center.
+ float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+ // Initialize left and right edges from end points and start Y
+ Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ // Get pointer to color buffer and depth buffer at current Y
+ P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer);
+ DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y));
+ // Loop along advancing Ys, rasterizing spans at each row
+ float checkY = min(min(l1.y, r1.y), clipRect.y1);
+ for (;;) {
+ // Check if we maybe passed edge ends or outside clip rect...
+ if (y > checkY) {
+ // If we're outside the clip rect, we're done.
+ if (y > clipRect.y1) break;
+ // Helper to find the next non-duplicate vertex that doesn't loop back.
+#define STEP_EDGE(e0i, e0, e1i, e1, STEP_POINT, end) \
+ for (;;) { \
+ /* Set new start of edge to be end of old edge */ \
+ e0i = e1i; \
+ e0 = e1; \
+ /* Set new end of edge to next point */ \
+ e1i = STEP_POINT(e1i); \
+ e1 = p[e1i]; \
+ /* If the edge is descending, use it. */ \
+ if (e1.y > e0.y) break; \
+ /* If the edge is ascending or crossed the end, we're done. */ \
+ if (e1.y < e0.y || e0i == end) return; \
+ /* Otherwise, it's a duplicate, so keep searching. */ \
+ }
+ // Check if Y advanced past the end of the left edge
+ if (y > l1.y) {
+ // Step to next left edge past Y and reset edge interpolants.
+ do {
+ STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i);
+ } while (y > l1.y);
+ left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ }
+ // Check if Y advanced past the end of the right edge
+ if (y > r1.y) {
+ // Step to next right edge past Y and reset edge interpolants.
+ do {
+ STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i);
+ } while (y > r1.y);
+ right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ }
+ // Reset check condition for next time around.
+ checkY = min(min(l1.y, r1.y), clipRect.y1);
+ }
+ // lx..rx form the bounds of the span. WR does not use backface culling,
+ // so we need to use min/max to support the span in either orientation.
+ // Clip the span to fall within the clip rect and then round to nearest
+ // column.
+ int startx = int(max(min(left.x, right.x), clipRect.x0) + 0.5f);
+ int endx = int(min(max(left.x, right.x), clipRect.x1) + 0.5f);
+ // Check if span is non-empty.
+ int span = endx - startx;
+ if (span > 0) {
+ ctx->shaded_rows++;
+ ctx->shaded_pixels += span;
+ // Advance color/depth buffer pointers to the start of the span.
+ P* buf = fbuf + startx;
+ // Check if we will need to use depth-buffer or discard on this span.
+ DepthRun* depth =
+ depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
+ DepthCursor cursor;
+ bool use_discard = fragment_shader->use_discard();
+ if (use_discard) {
+ if (depth) {
+ // If we're using discard, we may have to unpredictably drop out some
+ // samples. Flatten the depth run array here to allow this.
+ if (!depth->is_flat()) {
+ flatten_depth_runs(depth, depthtex.width);
+ }
+ // Advance to the depth sample at the start of the span.
+ depth += startx;
+ }
+ } else if (depth) {
+ if (!depth->is_flat()) {
+ // We're not using discard and the depth row is still organized into
+ // runs. Skip past any runs that would fail the depth test so we
+ // don't have to do any extra work to process them with the rest of
+ // the span.
+ cursor = DepthCursor(depth, depthtex.width, startx, span);
+ int skipped = cursor.skip_failed(z, ctx->depthfunc);
+ // If we fell off the row, that means we couldn't find any passing
+ // runs. We can just skip the entire span.
+ if (skipped < 0) {
+ goto next_span;
+ }
+ buf += skipped;
+ startx += skipped;
+ span -= skipped;
+ } else {
+ // The row is already flattened, so just advance to the span start.
+ depth += startx;
+ }
+ }
+
+ if (colortex.delay_clear) {
+ // Delayed clear is enabled for the color buffer. Check if needs clear.
+ prepare_row<P>(colortex, int(y), startx, endx, use_discard, depth, z,
+ &cursor);
+ }
+
+ // Initialize fragment shader interpolants to current span position.
+ fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+ fragment_shader->gl_FragCoord.y = y;
+ {
+ // Change in interpolants is difference between current right and left
+ // edges per the change in right and left X.
+ Interpolants step =
+ (right.interp - left.interp) * (1.0f / (right.x - left.x));
+ // Advance current interpolants to X at start of span.
+ Interpolants o = left.interp + step * (startx + 0.5f - left.x);
+ fragment_shader->init_span(&o, &step);
+ }
+ // Record the span start so clip-masked blending can find its mask values.
+ clipRect.init_span(startx, y, buf);
+ if (!use_discard) {
+ // Fast paths for the case where fragment discard is not used.
+ if (depth) {
+ // If depth is used, we want to process entire depth runs if depth is
+ // not flattened.
+ if (!depth->is_flat()) {
+ draw_depth_span(z, buf, cursor);
+ goto next_span;
+ }
+ // Otherwise, flattened depth must fall back to the slightly slower
+ // per-chunk depth test path in draw_span below.
+ } else {
+ // Check if the fragment shader has an optimized draw specialization.
+ if (span >= 4 && fragment_shader->has_draw_span(buf)) {
+ // Draw specialization expects 4-pixel chunks.
+ int len = span & ~3;
+ fragment_shader->draw_span(buf, len);
+ buf += len;
+ span &= 3;
+ }
+ }
+ draw_span<false, false>(buf, depth, span, [=] { return z; });
+ } else {
+ // If discard is used, then use slower fallbacks. This should be rare.
+ // Just needs to work, doesn't need to be too fast yet...
+ draw_span<true, false>(buf, depth, span, [=] { return z; });
+ }
+ }
+ next_span:
+ // Advance Y and edge interpolants to next row.
+ y++;
+ left.nextRow();
+ right.nextRow();
+ // Advance buffers to next row.
+ fbuf += colortex.stride() / sizeof(P);
+ fdepth += depthtex.stride() / sizeof(DepthRun);
+ }
+}
+
+// Draw perspective-correct spans for a convex quad that has been clipped to
+// the near and far Z planes, possibly producing a clipped convex polygon with
+// more than 4 sides. This assumes the Z value will vary across the spans and
+// requires interpolants to factor in W values. This tends to be slower than
+// the simpler 2D draw_quad_spans above, especially since we can't optimize the
+// depth test easily when Z values vary, and should be used only rarely if
+// possible.
+template <typename P>
+static inline void draw_perspective_spans(int nump, Point3D* p,
+ Interpolants* interp_outs,
+ Texture& colortex, int layer,
+ Texture& depthtex,
+ const ClipRect& clipRect) {
+ Point3D l0, r0, l1, r1;
+ int l0i, r0i, l1i, r1i;
+ {
+ // Find the index of the top-most point (smallest Y) from which
+ // rasterization can start.
+ int top = 0;
+ for (int i = 1; i < nump; i++) {
+ if (p[i].y < p[top].y) {
+ top = i;
+ }
+ }
+ // Find left-most top point, the start of the left descending edge.
+ // Advance forward in the points array, searching at most nump points
+ // in case the polygon is flat.
+ l0i = top;
+ for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) {
+ l0i = i;
+ }
+ if (l0i == nump - 1) {
+ for (int i = 0; i <= top && p[i].y == p[top].y; i++) {
+ l0i = i;
+ }
+ }
+ // Find right-most top point, the start of the right descending edge.
+ // Advance backward in the points array, searching at most nump points.
+ r0i = top;
+ for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) {
+ r0i = i;
+ }
+ if (r0i == 0) {
+ for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) {
+ r0i = i;
+ }
+ }
+ // End of left edge is next point after left edge start.
+ l1i = NEXT_POINT(l0i);
+ // End of right edge is prev point after right edge start.
+ r1i = PREV_POINT(r0i);
+ l0 = p[l0i]; // Start of left edge
+ r0 = p[r0i]; // End of left edge
+ l1 = p[l1i]; // Start of right edge
+ r1 = p[r1i]; // End of right edge
+ }
+
+ struct Edge {
+ float yScale;
+ // Current coordinates for edge. Where in the 2D case of draw_quad_spans,
+ // it is enough to just track the X coordinate as we advance along the rows,
+ // for the perspective case we also need to keep track of Z and W. For
+ // simplicity, we just use the full 3D point to track all these coordinates.
+ Point3D pSlope;
+ Point3D p;
+ Interpolants interpSlope;
+ Interpolants interp;
+
+ Edge(float y, const Point3D& p0, const Point3D& p1, const Interpolants& i0,
+ const Interpolants& i1)
+ : // Inverse Y scale for slope calculations. Avoid divide on 0-length
+ // edge.
+ yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+ // Calculate dX/dY slope
+ pSlope((p1 - p0) * yScale),
+ // Initialize current coords based on Y and slope
+ p(p0 + (y - p0.y) * pSlope),
+ // Crucially, these interpolants must be scaled by the point's 1/w
+ // value, which allows linear interpolation in a perspective-correct
+ // manner. This will be canceled out inside the fragment shader later.
+ // Calculate change in interpolants per change in Y
+ interpSlope((i1 * p1.w - i0 * p0.w) * yScale),
+ // Initialize current interpolants based on Y and slope
+ interp(i0 * p0.w + (y - p0.y) * interpSlope) {}
+
+ float x() const { return p.x; }
+ vec2_scalar zw() const { return {p.z, p.w}; }
+
+ void nextRow() {
+ // step current coords and interpolants to next row from slope
+ p += pSlope;
+ interp += interpSlope;
+ }
+ };
+
+ // Vertex selection above should result in equal left and right start rows
+ assert(l0.y == r0.y);
+ // Find the start y, clip to within the clip rect, and round to row center.
+ float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+ // Initialize left and right edges from end points and start Y
+ Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ // Get pointer to color buffer and depth buffer at current Y
+ P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer);
+ DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y));
+ // Loop along advancing Ys, rasterizing spans at each row
+ float checkY = min(min(l1.y, r1.y), clipRect.y1);
+ for (;;) {
+ // Check if we maybe passed edge ends or outside clip rect...
+ if (y > checkY) {
+ // If we're outside the clip rect, we're done.
+ if (y > clipRect.y1) break;
+ // Check if Y advanced past the end of the left edge
+ if (y > l1.y) {
+ // Step to next left edge past Y and reset edge interpolants.
+ do {
+ STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i);
+ } while (y > l1.y);
+ left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ }
+ // Check if Y advanced past the end of the right edge
+ if (y > r1.y) {
+ // Step to next right edge past Y and reset edge interpolants.
+ do {
+ STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i);
+ } while (y > r1.y);
+ right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ }
+ // Reset check condition for next time around.
+ checkY = min(min(l1.y, r1.y), clipRect.y1);
+ }
+ // lx..rx form the bounds of the span. WR does not use backface culling,
+ // so we need to use min/max to support the span in either orientation.
+ // Clip the span to fall within the clip rect and then round to nearest
+ // column.
+ int startx = int(max(min(left.x(), right.x()), clipRect.x0) + 0.5f);
+ int endx = int(min(max(left.x(), right.x()), clipRect.x1) + 0.5f);
+ // Check if span is non-empty.
+ int span = endx - startx;
+ if (span > 0) {
+ ctx->shaded_rows++;
+ ctx->shaded_pixels += span;
+ // Advance color/depth buffer pointers to the start of the span.
+ P* buf = fbuf + startx;
+ // Check if we will need to use the depth buffer or discard on this span.
+ DepthRun* depth =
+ depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
+ bool use_discard = fragment_shader->use_discard();
+ if (depth) {
+ // Perspective may cause the depth value to vary on a per sample basis.
+ // Ensure the depth row is flattened to allow testing of individual
+ // samples
+ if (!depth->is_flat()) {
+ flatten_depth_runs(depth, depthtex.width);
+ }
+ // Advance to the depth sample at the start of the span.
+ depth += startx;
+ }
+ if (colortex.delay_clear) {
+ // Delayed clear is enabled for the color buffer. Check if needs clear.
+ prepare_row<P>(colortex, int(y), startx, endx, use_discard, depth);
+ }
+ // Initialize fragment shader interpolants to current span position.
+ fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+ fragment_shader->gl_FragCoord.y = y;
+ {
+ // Calculate the fragment Z and W change per change in fragment X step.
+ vec2_scalar stepZW =
+ (right.zw() - left.zw()) * (1.0f / (right.x() - left.x()));
+ // Calculate initial Z and W values for span start.
+ vec2_scalar zw = left.zw() + stepZW * (startx + 0.5f - left.x());
+ // Set fragment shader's Z and W values so that it can use them to
+ // cancel out the 1/w baked into the interpolants.
+ fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x);
+ fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y);
+ fragment_shader->swgl_StepZW = stepZW;
+ // Change in interpolants is difference between current right and left
+ // edges per the change in right and left X. The left and right
+ // interpolant values were previously multiplied by 1/w, so the step and
+ // initial span values take this into account.
+ Interpolants step =
+ (right.interp - left.interp) * (1.0f / (right.x() - left.x()));
+ // Advance current interpolants to X at start of span.
+ Interpolants o = left.interp + step * (startx + 0.5f - left.x());
+ fragment_shader->init_span<true>(&o, &step);
+ }
+ clipRect.init_span(startx, y, buf);
+ if (!use_discard) {
+ // No discard is used. Common case.
+ draw_span<false, true>(buf, depth, span, packDepth);
+ } else {
+ // Discard is used. Rare.
+ draw_span<true, true>(buf, depth, span, packDepth);
+ }
+ }
+ // Advance Y and edge interpolants to next row.
+ y++;
+ left.nextRow();
+ right.nextRow();
+ // Advance buffers to next row.
+ fbuf += colortex.stride() / sizeof(P);
+ fdepth += depthtex.stride() / sizeof(DepthRun);
+ }
+}
+
+// Clip a primitive against both sides of a view-frustum axis, producing
+// intermediate vertexes with interpolated attributes that will no longer
+// intersect the selected axis planes. This assumes the primitive is convex
+// and should produce at most N+2 vertexes for each invocation (only in the
+// worst case where one point falls outside on each of the opposite sides
+// with the rest of the points inside).
+template <XYZW AXIS>
+static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP,
+ Interpolants* outInterp) {
+ // Potential mask bits of which side of a plane a coordinate falls on.
+ enum SIDE { POSITIVE = 1, NEGATIVE = 2 };
+ int numClip = 0;
+ Point3D prev = p[nump - 1];
+ Interpolants prevInterp = interp[nump - 1];
+ float prevCoord = prev.select(AXIS);
+ // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and
+ // if so, remember which side it is outside of. In the special case that W is
+ // negative and |C| < |W|, both -W <= C and C <= W will be false, such that
+ // we must consider the coordinate as falling outside of both plane sides
+ // simultaneously. We test each condition separately and combine them to form
+ // a mask of which plane sides we exceeded. If we neglect to consider both
+ // sides simultaneously, points can erroneously oscillate from one plane side
+ // to the other and exceed the supported maximum number of clip outputs.
+ int prevMask = (prevCoord < -prev.w ? NEGATIVE : 0) |
+ (prevCoord > prev.w ? POSITIVE : 0);
+ // Loop through points, finding edges that cross the planes by evaluating
+ // the side at each point.
+ for (int i = 0; i < nump; i++) {
+ Point3D cur = p[i];
+ Interpolants curInterp = interp[i];
+ float curCoord = cur.select(AXIS);
+ int curMask =
+ (curCoord < -cur.w ? NEGATIVE : 0) | (curCoord > cur.w ? POSITIVE : 0);
+ // Check if the previous and current end points are on different sides. If
+ // the masks of sides intersect, then we consider them to be on the same
+ // side. So in the case the masks do not intersect, we then consider them
+ // to fall on different sides.
+ if (!(curMask & prevMask)) {
+ // One of the edge's end points is outside the plane with the other
+ // inside the plane. Find the offset where it crosses the plane and
+ // adjust the point and interpolants to there.
+ if (prevMask) {
+ // Edge that was previously outside crosses inside.
+ // Evaluate plane equation for previous and current end-point
+ // based on previous side and calculate relative offset.
+ if (numClip >= nump + 2) {
+ // If for some reason we produced more vertexes than we support, just
+ // bail out.
+ assert(false);
+ return 0;
+ }
+ // The positive plane is assigned the sign 1, and the negative plane is
+ // assigned -1. If the point falls outside both planes, that means W is
+ // negative. To compensate for this, we must interpolate the coordinate
+ // till W=0, at which point we can choose a single plane side for the
+ // coordinate to fall on since W will no longer be negative. To compute
+ // the coordinate where W=0, we compute K = prev.w / (prev.w-cur.w) and
+ // interpolate C = prev.C + K*(cur.C - prev.C). The sign of C will be
+ // the side of the plane we need to consider. Substituting K into the
+ // comparison C < 0, we can then avoid the division in K with a
+ // cross-multiplication.
+ float prevSide =
+ (prevMask & NEGATIVE) && (!(prevMask & POSITIVE) ||
+ prevCoord * (cur.w - prev.w) <
+ prev.w * (curCoord - prevCoord))
+ ? -1
+ : 1;
+ float prevDist = prevCoord - prevSide * prev.w;
+ float curDist = curCoord - prevSide * cur.w;
+ // It may happen that after we interpolate by the weight k that due to
+ // floating point rounding we've underestimated the value necessary to
+ // push it over the clipping boundary. Just in case, nudge the mantissa
+ // by a single increment so that we essentially round it up and move it
+ // further inside the clipping boundary. We use nextafter to do this in
+ // a portable fashion.
+ float k = prevDist / (prevDist - curDist);
+ Point3D clipped = prev + (cur - prev) * k;
+ if (prevSide * clipped.select(AXIS) > clipped.w) {
+ k = nextafterf(k, 1.0f);
+ clipped = prev + (cur - prev) * k;
+ }
+ outP[numClip] = clipped;
+ outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+ numClip++;
+ }
+ if (curMask) {
+ // Edge that was previously inside crosses outside.
+ // Evaluate plane equation for previous and current end-point
+ // based on current side and calculate relative offset.
+ if (numClip >= nump + 2) {
+ assert(false);
+ return 0;
+ }
+ // In the case the coordinate falls on both plane sides, the computation
+ // here is much the same as for prevSide, but since we are going from a
+ // previous W that is positive to current W that is negative, then the
+ // sign of cur.w - prev.w will flip in the equation. The resulting sign
+ // is negated to compensate for this.
+ float curSide =
+ (curMask & POSITIVE) && (!(curMask & NEGATIVE) ||
+ prevCoord * (cur.w - prev.w) <
+ prev.w * (curCoord - prevCoord))
+ ? 1
+ : -1;
+ float prevDist = prevCoord - curSide * prev.w;
+ float curDist = curCoord - curSide * cur.w;
+ // Calculate interpolation weight k and then nudge it inside the clipping
+ // boundary with nextafter. Note that since we were previously inside
+ // and now crossing outside, we have to flip the nudge direction for
+ // the weight towards 0 instead of 1.
+ float k = prevDist / (prevDist - curDist);
+ Point3D clipped = prev + (cur - prev) * k;
+ if (curSide * clipped.select(AXIS) > clipped.w) {
+ k = nextafterf(k, 0.0f);
+ clipped = prev + (cur - prev) * k;
+ }
+ outP[numClip] = clipped;
+ outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+ numClip++;
+ }
+ }
+ if (!curMask) {
+ // The current end point is inside the plane, so output point unmodified.
+ if (numClip >= nump + 2) {
+ assert(false);
+ return 0;
+ }
+ outP[numClip] = cur;
+ outInterp[numClip] = curInterp;
+ numClip++;
+ }
+ prev = cur;
+ prevInterp = curInterp;
+ prevCoord = curCoord;
+ prevMask = curMask;
+ }
+ return numClip;
+}
+
+// Helper function to dispatch to perspective span drawing with points that
+// have already been transformed and clipped.
+static inline void draw_perspective_clipped(int nump, Point3D* p_clip,
+ Interpolants* interp_clip,
+ Texture& colortex, int layer,
+ Texture& depthtex) {
+ // If polygon is outside clip rect, nothing to draw.
+ ClipRect clipRect(colortex);
+ if (!clipRect.overlaps(nump, p_clip)) {
+ return;
+ }
+
+ // Finally draw perspective-correct spans for the polygon.
+ if (colortex.internal_format == GL_RGBA8) {
+ draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex, layer,
+ depthtex, clipRect);
+ } else if (colortex.internal_format == GL_R8) {
+ draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex, layer,
+ depthtex, clipRect);
+ } else {
+ assert(false);
+ }
+}
+
+// Draws a perspective-correct 3D primitive with varying Z value, as opposed
+// to a simple 2D planar primitive with a constant Z value that could be
+// trivially Z rejected. This requires clipping the primitive against the near
+// and far planes to ensure it stays within the valid Z-buffer range. The Z
+// and W of each fragment of the primitives are interpolated across the
+// generated spans and then depth-tested as appropriate.
+// Additionally, vertex attributes must be interpolated with perspective-
+// correction by dividing by W before interpolation, and then later multiplied
+// by W again to produce the final correct attribute value for each fragment.
+// This process is expensive and should be avoided if possible for primitive
+// batches that are known ahead of time to not need perspective-correction.
+static void draw_perspective(int nump, Interpolants interp_outs[4],
+ Texture& colortex, int layer, Texture& depthtex) {
+ // Lines are not supported with perspective.
+ assert(nump >= 3);
+ // Convert output of vertex shader to screen space.
+ vec4 pos = vertex_shader->gl_Position;
+ vec3_scalar scale =
+ vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f;
+ vec3_scalar offset =
+ make_vec3(make_vec2(ctx->viewport.origin() - colortex.offset), 0.0f) +
+ scale;
+ if (test_none(pos.z <= -pos.w || pos.z >= pos.w)) {
+ // No points cross the near or far planes, so no clipping required.
+ // Just divide coords by W and convert to viewport. We assume the W
+ // coordinate is non-zero and the reciprocal is finite since it would
+ // otherwise fail the test_none condition.
+ Float w = 1.0f / pos.w;
+ vec3 screen = pos.sel(X, Y, Z) * w * scale + offset;
+ Point3D p[4] = {{screen.x.x, screen.y.x, screen.z.x, w.x},
+ {screen.x.y, screen.y.y, screen.z.y, w.y},
+ {screen.x.z, screen.y.z, screen.z.z, w.z},
+ {screen.x.w, screen.y.w, screen.z.w, w.w}};
+ draw_perspective_clipped(nump, p, interp_outs, colortex, layer, depthtex);
+ } else {
+ // Points cross the near or far planes, so we need to clip.
+ // Start with the original 3 or 4 points...
+ Point3D p[4] = {{pos.x.x, pos.y.x, pos.z.x, pos.w.x},
+ {pos.x.y, pos.y.y, pos.z.y, pos.w.y},
+ {pos.x.z, pos.y.z, pos.z.z, pos.w.z},
+ {pos.x.w, pos.y.w, pos.z.w, pos.w.w}};
+ // Clipping can expand the points by 1 for each of 6 view frustum planes.
+ Point3D p_clip[4 + 6];
+ Interpolants interp_clip[4 + 6];
+ // Clip against near and far Z planes.
+ nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip);
+ // If no points are left inside the view frustum, there's nothing to draw.
+ if (nump < 3) {
+ return;
+ }
+ // After clipping against only the near and far planes, we might still
+ // produce points where W = 0, exactly at the camera plane. OpenGL specifies
+ // that for clip coordinates, points must satisfy:
+ // -W <= X <= W
+ // -W <= Y <= W
+ // -W <= Z <= W
+ // When Z = W = 0, this is trivially satisfied, but when we transform and
+ // divide by W below it will produce a divide by 0. Usually we want to only
+ // clip Z to avoid the extra work of clipping X and Y. We can still project
+ // points that fall outside the view frustum X and Y so long as Z is valid.
+ // The span drawing code will then ensure X and Y are clamped to viewport
+ // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y,
+ // will push W further inside the view frustum so that it is no longer 0,
+ // allowing us to finally proceed to projecting the points to the screen.
+ for (int i = 0; i < nump; i++) {
+ // Found an invalid W, so need to clip against X and Y...
+ if (p_clip[i].w <= 0.0f) {
+ // Ping-pong p_clip -> p_tmp -> p_clip.
+ Point3D p_tmp[4 + 6];
+ Interpolants interp_tmp[4 + 6];
+ nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp);
+ if (nump < 3) return;
+ nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip);
+ if (nump < 3) return;
+ // After clipping against X and Y planes, there are still points left
+ // to draw, so proceed to trying projection now...
+ break;
+ }
+ }
+ // Divide coords by W and convert to viewport.
+ for (int i = 0; i < nump; i++) {
+ float w = 1.0f / p_clip[i].w;
+ // If the W coord is essentially zero, small enough that division would
+ // result in Inf/NaN, then just set the reciprocal itself to zero so that
+ // the coordinates become zeroed out, as the only valid point that
+ // satisfies -W <= X/Y/Z <= W is all zeroes.
+ if (!isfinite(w)) w = 0.0f;
+ p_clip[i] = Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w);
+ }
+ draw_perspective_clipped(nump, p_clip, interp_clip, colortex, layer,
+ depthtex);
+ }
+}
+
+static void draw_quad(int nump, Texture& colortex, int layer,
+ Texture& depthtex) {
+ // Run vertex shader once for the primitive's vertices.
+ // Only the 4 per-vertex sets of interpolants are stored here; if the
+ // perspective path has to clip, it writes into its own larger arrays.
+ Interpolants interp_outs[4];
+ swgl_ClipMask = nullptr;
+ vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants));
+ vec4 pos = vertex_shader->gl_Position;
+ // Check if any vertex W is different from another. If so, use perspective.
+ if (test_any(pos.w != pos.w.x)) {
+ draw_perspective(nump, interp_outs, colortex, layer, depthtex);
+ return;
+ }
+
+ // Convert output of vertex shader to screen space.
+ // Divide coords by W and convert to viewport.
+ float w = 1.0f / pos.w.x;
+ // If the W coord is essentially zero, small enough that division would
+ // result in Inf/NaN, then just set the reciprocal itself to zero so that
+ // the coordinates become zeroed out, as the only valid point that
+ // satisfies -W <= X/Y/Z <= W is all zeroes.
+ if (!isfinite(w)) w = 0.0f;
+ vec2 screen = (pos.sel(X, Y) * w + 1) * 0.5f *
+ vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) +
+ make_vec2(ctx->viewport.origin() - colortex.offset);
+ Point2D p[4] = {{screen.x.x, screen.y.x},
+ {screen.x.y, screen.y.y},
+ {screen.x.z, screen.y.z},
+ {screen.x.w, screen.y.w}};
+
+ // If quad is outside clip rect, nothing to draw.
+ ClipRect clipRect(colortex);
+ if (!clipRect.overlaps(nump, p)) {
+ return;
+ }
+
+ // Since the quad is assumed 2D, Z is constant across the quad.
+ float screenZ = (pos.z.x * w + 1) * 0.5f;
+ if (screenZ < 0 || screenZ > 1) {
+ // Z values would cross the near or far plane, so just bail.
+ return;
+ }
+ // Since Z doesn't need to be interpolated, just set the fragment shader's
+ // Z and W values here, once and for all fragment shader invocations.
+ uint16_t z = uint16_t(0xFFFF * screenZ);
+ fragment_shader->gl_FragCoord.z = screenZ;
+ fragment_shader->gl_FragCoord.w = w;
+
+ // If supplied a line, adjust it so that it is a quad at least 1 pixel thick.
+ // Assume that for a line that all 4 SIMD lanes were actually filled with
+ // vertexes 0, 1, 1, 0.
+ if (nump == 2) {
+ // Nudge Y height to span at least 1 pixel by advancing to next pixel
+ // boundary so that we step at least 1 row when drawing spans.
+ if (int(p[0].y + 0.5f) == int(p[1].y + 0.5f)) {
+ p[2].y = 1 + int(p[1].y + 0.5f);
+ p[3].y = p[2].y;
+ // Nudge X width to span at least 1 pixel so that rounded coords fall on
+ // separate pixels.
+ if (int(p[0].x + 0.5f) == int(p[1].x + 0.5f)) {
+ p[1].x += 1.0f;
+ p[2].x += 1.0f;
+ }
+ } else {
+ // If the line already spans at least 1 row, then assume line is vertical
+ // or diagonal and just needs to be dilated horizontally.
+ p[2].x += 1.0f;
+ p[3].x += 1.0f;
+ }
+ // Pretend that it's a quad now...
+ nump = 4;
+ }
+
+ // Finally draw 2D spans for the quad. Currently only supports drawing to
+ // RGBA8 and R8 color buffers.
+ if (colortex.internal_format == GL_RGBA8) {
+ draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, layer,
+ depthtex, clipRect);
+ } else if (colortex.internal_format == GL_R8) {
+ draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, layer, depthtex,
+ clipRect);
+ } else {
+ assert(false);
+ }
+}
+
+void VertexArray::validate() {
+ int last_enabled = -1;
+ for (int i = 0; i <= max_attrib; i++) {
+ VertexAttrib& attr = attribs[i];
+ if (attr.enabled) {
+ // VertexArray &v = ctx->vertex_arrays[attr.vertex_array];
+ Buffer& vertex_buf = ctx->buffers[attr.vertex_buffer];
+ attr.buf = vertex_buf.buf;
+ attr.buf_size = vertex_buf.size;
+ // debugf("%d %x %d %d %d %d\n", i, attr.type, attr.size, attr.stride,
+ // attr.offset, attr.divisor);
+ last_enabled = i;
+ }
+ }
+ max_attrib = last_enabled;
+}
+
+template <typename INDEX>
+static inline void draw_elements(GLsizei count, GLsizei instancecount,
+ size_t offset, VertexArray& v,
+ Texture& colortex, int layer,
+ Texture& depthtex) {
+ Buffer& indices_buf = ctx->buffers[v.element_array_buffer_binding];
+ if (!indices_buf.buf || offset >= indices_buf.size) {
+ return;
+ }
+ assert((offset & (sizeof(INDEX) - 1)) == 0);
+ INDEX* indices = (INDEX*)(indices_buf.buf + offset);
+ count = min(count, (GLsizei)((indices_buf.size - offset) / sizeof(INDEX)));
+ // Triangles must be indexed at offsets 0, 1, 2.
+ // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
+ if (count == 6 && indices[1] == indices[0] + 1 &&
+ indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) {
+ assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1);
+ // Fast path - since there is only a single quad, we only load per-vertex
+ // attribs once for all instances, as they won't change across instances
+ // or within an instance.
+ vertex_shader->load_attribs(v.attribs, indices[0], 0, 4);
+ draw_quad(4, colortex, layer, depthtex);
+ for (GLsizei instance = 1; instance < instancecount; instance++) {
+ vertex_shader->load_attribs(v.attribs, indices[0], instance, 0);
+ draw_quad(4, colortex, layer, depthtex);
+ }
+ } else {
+ for (GLsizei instance = 0; instance < instancecount; instance++) {
+ for (GLsizei i = 0; i + 3 <= count; i += 3) {
+ if (indices[i + 1] != indices[i] + 1 ||
+ indices[i + 2] != indices[i] + 2) {
+ continue;
+ }
+ if (i + 6 <= count && indices[i + 5] == indices[i] + 3) {
+ assert(indices[i + 3] == indices[i] + 2 &&
+ indices[i + 4] == indices[i] + 1);
+ vertex_shader->load_attribs(v.attribs, indices[i], instance, 4);
+ draw_quad(4, colortex, layer, depthtex);
+ i += 3;
+ } else {
+ vertex_shader->load_attribs(v.attribs, indices[i], instance, 3);
+ draw_quad(3, colortex, layer, depthtex);
+ }
+ }
+ }
+ }
+}
+
+extern "C" {
+
+void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
+ GLintptr offset, GLsizei instancecount) {
+ if (offset < 0 || count <= 0 || instancecount <= 0 || !vertex_shader ||
+ !fragment_shader) {
+ return;
+ }
+
+ Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER);
+ Texture& colortex = ctx->textures[fb.color_attachment];
+ if (!colortex.buf) {
+ return;
+ }
+ assert(!colortex.locked);
+ assert(colortex.internal_format == GL_RGBA8 ||
+ colortex.internal_format == GL_R8);
+ Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0];
+ if (depthtex.buf) {
+ assert(depthtex.internal_format == GL_DEPTH_COMPONENT16);
+ assert(colortex.width == depthtex.width &&
+ colortex.height == depthtex.height);
+ assert(colortex.offset == depthtex.offset);
+ }
+
+ // debugf("current_vertex_array %d\n", ctx->current_vertex_array);
+ // debugf("indices size: %d\n", indices_buf.size);
+ VertexArray& v = ctx->vertex_arrays[ctx->current_vertex_array];
+ if (ctx->validate_vertex_array) {
+ ctx->validate_vertex_array = false;
+ v.validate();
+ }
+
+#ifdef PRINT_TIMINGS
+ uint64_t start = get_time_value();
+#endif
+
+ ctx->shaded_rows = 0;
+ ctx->shaded_pixels = 0;
+
+ vertex_shader->init_batch();
+
+ switch (type) {
+ case GL_UNSIGNED_SHORT:
+ assert(mode == GL_TRIANGLES);
+ draw_elements<uint16_t>(count, instancecount, offset, v, colortex,
+ fb.layer, depthtex);
+ break;
+ case GL_UNSIGNED_INT:
+ assert(mode == GL_TRIANGLES);
+ draw_elements<uint32_t>(count, instancecount, offset, v, colortex,
+ fb.layer, depthtex);
+ break;
+ case GL_NONE:
+ // Non-standard GL extension - if element type is GL_NONE, then we don't
+ // use any element buffer and behave as if DrawArrays was called instead.
+ for (GLsizei instance = 0; instance < instancecount; instance++) {
+ switch (mode) {
+ case GL_LINES:
+ for (GLsizei i = 0; i + 2 <= count; i += 2) {
+ vertex_shader->load_attribs(v.attribs, offset + i, instance, 2);
+ draw_quad(2, colortex, fb.layer, depthtex);
+ }
+ break;
+ case GL_TRIANGLES:
+ for (GLsizei i = 0; i + 3 <= count; i += 3) {
+ vertex_shader->load_attribs(v.attribs, offset + i, instance, 3);
+ draw_quad(3, colortex, fb.layer, depthtex);
+ }
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+ if (ctx->samples_passed_query) {
+ Query& q = ctx->queries[ctx->samples_passed_query];
+ q.value += ctx->shaded_pixels;
+ }
+
+#ifdef PRINT_TIMINGS
+ uint64_t end = get_time_value();
+ printf(
+ "%7.3fms draw(%s, %d): %d pixels in %d rows (avg %f pixels/row, "
+ "%fns/pixel)\n",
+ double(end - start) / (1000. * 1000.),
+ ctx->programs[ctx->current_program].impl->get_name(), instancecount,
+ ctx->shaded_pixels, ctx->shaded_rows,
+ double(ctx->shaded_pixels) / ctx->shaded_rows,
+ double(end - start) / max(ctx->shaded_pixels, 1));
+#endif
+}
+
+void Finish() {
+#ifdef PRINT_TIMINGS
+ printf("Finish\n");
+#endif
+}
+
+void MakeCurrent(Context* c) {
+ if (ctx == c) {
+ return;
+ }
+ ctx = c;
+ setup_program(ctx ? ctx->current_program : 0);
+}
+
+Context* CreateContext() { return new Context; }
+
+void ReferenceContext(Context* c) {
+ if (!c) {
+ return;
+ }
+ ++c->references;
+}
+
+void DestroyContext(Context* c) {
+ if (!c) {
+ return;
+ }
+ assert(c->references > 0);
+ --c->references;
+ if (c->references > 0) {
+ return;
+ }
+ if (ctx == c) {
+ MakeCurrent(nullptr);
+ }
+ delete c;
+}
+
+} // extern "C"
diff --git a/gfx/wr/swgl/src/gl_defs.h b/gfx/wr/swgl/src/gl_defs.h
new file mode 100644
index 0000000000..f5c2fb21d9
--- /dev/null
+++ b/gfx/wr/swgl/src/gl_defs.h
@@ -0,0 +1,193 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+typedef int8_t GLbyte;
+typedef uint8_t GLubyte;
+typedef int16_t GLshort;
+typedef uint16_t GLushort;
+typedef int32_t GLint;
+typedef uint32_t GLuint;
+typedef int64_t GLint64;
+typedef uint64_t GLuint64;
+
+typedef float GLfloat;
+typedef double GLdouble;
+
+typedef uint32_t GLenum;
+typedef uint8_t GLboolean;
+typedef uint32_t GLbitfield;
+
+typedef int32_t GLsizei;
+typedef size_t GLsizeiptr;
+typedef intptr_t GLintptr;
+
+#define GL_FALSE 0
+#define GL_TRUE 1
+
+#define GL_NONE 0
+
+#define GL_NO_ERROR 0
+
+#define GL_RGBA32F 0x8814
+#define GL_RGBA8 0x8058
+#define GL_R8 0x8229
+#define GL_R16 0x822A
+#define GL_RGBA32I 0x8D82
+#define GL_BGRA8 0x93A1
+#define GL_RG8 0x822B
+
+#define GL_BYTE 0x1400
+#define GL_UNSIGNED_BYTE 0x1401
+#define GL_SHORT 0x1402
+#define GL_UNSIGNED_SHORT 0x1403
+#define GL_INT 0x1404
+#define GL_UNSIGNED_INT 0x1405
+#define GL_FLOAT 0x1406
+
+#define GL_RED 0x1903
+#define GL_GREEN 0x1904
+#define GL_BLUE 0x1905
+#define GL_ALPHA 0x1906
+#define GL_RGB 0x1907
+#define GL_RGBA 0x1908
+#define GL_RGBA_INTEGER 0x8D99
+#define GL_BGRA 0x80E1
+#define GL_RG 0x8227
+
+#define GL_DEPTH_COMPONENT 0x1902
+#define GL_DEPTH_COMPONENT16 0x81A5
+#define GL_DEPTH_COMPONENT24 0x81A6
+#define GL_DEPTH_COMPONENT32 0x81A7
+
+#define GL_ARRAY_BUFFER 0x8892
+#define GL_ELEMENT_ARRAY_BUFFER 0x8893
+
+#define GL_READ_FRAMEBUFFER 0x8CA8
+#define GL_DRAW_FRAMEBUFFER 0x8CA9
+#define GL_FRAMEBUFFER 0x8D40
+#define GL_DRAW_FRAMEBUFFER_BINDING 0x8CA6
+#define GL_READ_FRAMEBUFFER_BINDING 0x8CAA
+#define GL_RENDERBUFFER 0x8D41
+#define GL_COLOR_ATTACHMENT0 0x8CE0
+#define GL_DEPTH_ATTACHMENT 0x8D00
+#define GL_STENCIL_ATTACHMENT 0x8D20
+#define GL_FRAMEBUFFER_COMPLETE 0x8CD5
+#define GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT 0x8CD6
+#define GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT 0x8CD7
+#define GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER 0x8CDB
+#define GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER 0x8CDC
+#define GL_FRAMEBUFFER_UNSUPPORTED 0x8CDD
+#define GL_COLOR_BUFFER_BIT 0x00004000
+#define GL_DEPTH_BUFFER_BIT 0x00000100
+#define GL_STENCIL_BUFFER_BIT 0x00000400
+
+#define GL_PIXEL_PACK_BUFFER 0x88EB
+#define GL_PIXEL_UNPACK_BUFFER 0x88EC
+#define GL_PIXEL_PACK_BUFFER_BINDING 0x88ED
+#define GL_PIXEL_UNPACK_BUFFER_BINDING 0x88EF
+#define GL_UNPACK_ROW_LENGTH 0x0CF2
+#define GL_UNPACK_ALIGNMENT 0x0CF5
+
+#define GL_QUERY_RESULT 0x8866
+#define GL_QUERY_RESULT_AVAILABLE 0x8867
+#define GL_TIME_ELAPSED 0x88BF
+#define GL_SAMPLES_PASSED 0x8914
+
+#define GL_NEAREST 0x2600
+#define GL_LINEAR 0x2601
+#define GL_NEAREST_MIPMAP_NEAREST 0x2700
+#define GL_NEAREST_MIPMAP_LINEAR 0x2702
+#define GL_LINEAR_MIPMAP_NEAREST 0x2701
+#define GL_LINEAR_MIPMAP_LINEAR 0x2703
+#define GL_TEXTURE_WRAP_S 0x2802
+#define GL_TEXTURE_WRAP_T 0x2803
+#define GL_TEXTURE_MAG_FILTER 0x2800
+#define GL_TEXTURE_MIN_FILTER 0x2801
+#define GL_CLAMP_TO_EDGE 0x812F
+#define GL_TEXTURE_2D 0x0DE1
+#define GL_TEXTURE_3D 0x806F
+#define GL_TEXTURE_2D_ARRAY 0x8C1A
+#define GL_TEXTURE_RECTANGLE 0x84F5
+#define GL_TEXTURE0 0x84C0
+#define GL_TEXTURE1 0x84C1
+#define GL_TEXTURE2 0x84C2
+#define GL_TEXTURE3 0x84C3
+#define GL_TEXTURE4 0x84C4
+#define GL_TEXTURE5 0x84C5
+#define GL_TEXTURE6 0x84C6
+#define GL_TEXTURE7 0x84C7
+#define GL_TEXTURE8 0x84C8
+#define GL_TEXTURE9 0x84C9
+#define GL_TEXTURE10 0x84CA
+#define GL_TEXTURE11 0x84CB
+#define GL_TEXTURE12 0x84CC
+#define GL_TEXTURE13 0x84CD
+#define GL_TEXTURE14 0x84CE
+#define GL_TEXTURE15 0x84CF
+#define GL_MAX_TEXTURE_UNITS 0x84E2
+#define GL_MAX_TEXTURE_IMAGE_UNITS 0x8872
+#define GL_MAX_TEXTURE_SIZE 0x0D33
+#define GL_MAX_ARRAY_TEXTURE_LAYERS 0x88FF
+
+#define GL_VERTEX_SHADER 0x8B31
+#define GL_FRAGMENT_SHADER 0x8B30
+
+#define GL_BLEND 0x0BE2
+#define GL_ZERO 0
+#define GL_ONE 1
+#define GL_SRC_COLOR 0x0300
+#define GL_ONE_MINUS_SRC_COLOR 0x0301
+#define GL_SRC_ALPHA 0x0302
+#define GL_ONE_MINUS_SRC_ALPHA 0x0303
+#define GL_DST_ALPHA 0x0304
+#define GL_ONE_MINUS_DST_ALPHA 0x0305
+#define GL_DST_COLOR 0x0306
+#define GL_ONE_MINUS_DST_COLOR 0x0307
+#define GL_CONSTANT_COLOR 0x8001
+#define GL_ONE_MINUS_CONSTANT_COLOR 0x8002
+#define GL_CONSTANT_ALPHA 0x8003
+#define GL_ONE_MINUS_CONSTANT_ALPHA 0x8004
+#define GL_SRC1_ALPHA 0x8589
+#define GL_SRC1_COLOR 0x88F9
+#define GL_ONE_MINUS_SRC1_COLOR 0x88FA
+#define GL_ONE_MINUS_SRC1_ALPHA 0x88FB
+
+#define GL_FUNC_ADD 0x8006
+
+#define GL_NEVER 0x0200
+#define GL_LESS 0x0201
+#define GL_EQUAL 0x0202
+#define GL_LEQUAL 0x0203
+#define GL_GREATER 0x0204
+#define GL_NOTEQUAL 0x0205
+#define GL_GEQUAL 0x0206
+#define GL_ALWAYS 0x0207
+#define GL_DEPTH_TEST 0x0B71
+#define GL_DEPTH_WRITEMASK 0x0B72
+
+#define GL_SCISSOR_TEST 0x0C11
+
+#define GL_VENDOR 0x1F00
+#define GL_RENDERER 0x1F01
+#define GL_VERSION 0x1F02
+#define GL_EXTENSIONS 0x1F03
+#define GL_NUM_EXTENSIONS 0x821D
+#define GL_MINOR_VERSION 0x821C
+#define GL_MAJOR_VERSION 0x821B
+
+#define GL_POINTS 0x0000
+#define GL_LINES 0x0001
+#define GL_LINE_LOOP 0x0002
+#define GL_LINE_STRIP 0x0003
+#define GL_TRIANGLES 0x0004
+#define GL_TRIANGLE_STRIP 0x0005
+#define GL_TRIANGLE_FAN 0x0006
+#define GL_QUADS 0x0007
+
+#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367
+
+#define GL_RGB_422_APPLE 0x8A1F
+#define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA
+#define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB
+#define GL_RGB_RAW_422_APPLE 0x8A51
diff --git a/gfx/wr/swgl/src/glsl.h b/gfx/wr/swgl/src/glsl.h
new file mode 100644
index 0000000000..875561c8b3
--- /dev/null
+++ b/gfx/wr/swgl/src/glsl.h
@@ -0,0 +1,2669 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#define SI ALWAYS_INLINE static
+
+#include "vector_type.h"
+
+namespace glsl {
+
+// Texture formats a sampler can be tagged with (see samplerCommon::format).
+enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8, RG8, R16, YUV422 };
+
+// Filter modes a sampler can be tagged with (see samplerFilter::filter).
+enum TextureFilter { NEAREST, LINEAR };
+
+// State shared by every sampler type: the backing buffer, its dimensions and
+// its format. Defaults describe an empty RGBA8 texture.
+struct samplerCommon {
+ uint32_t* buf = nullptr;
+ uint32_t stride = 0; // in units of BPP if < 4, or dwords if BPP >= 4
+ uint32_t height = 0;
+ uint32_t width = 0;
+ TextureFormat format = TextureFormat::RGBA8;
+};
+
+// Additional state mixed into array samplers (see sampler2DArray_impl).
+struct samplerDepth {
+ int depth = 0;
+ uint32_t height_stride = 0; // in units of BPP if < 4, or dwords if BPP >= 4
+};
+
+// Filter-mode state mixed into samplers that carry a filter; defaults to
+// NEAREST.
+struct samplerFilter {
+ TextureFilter filter = TextureFilter::NEAREST;
+};
+
+// 2D array sampler: common texture state plus depth and filter state. The
+// sampler handle types below are plain pointers to the *_impl structs.
+struct sampler2DArray_impl : samplerCommon, samplerDepth, samplerFilter {};
+typedef sampler2DArray_impl* sampler2DArray;
+
+// Empty per-format subtypes of the array sampler. NOTE(review): presumably
+// these exist so format-specific code can be chosen by handle type - confirm
+// against the users of these typedefs.
+typedef struct sampler2DArrayR8_impl : sampler2DArray_impl{} * sampler2DArrayR8;
+typedef struct sampler2DArrayRG8_impl : sampler2DArray_impl{} *
+ sampler2DArrayRG8;
+typedef struct sampler2DArrayRGBA8_impl : sampler2DArray_impl{} *
+ sampler2DArrayRGBA8;
+typedef struct sampler2DArrayRGBA32F_impl : sampler2DArray_impl{} *
+ sampler2DArrayRGBA32F;
+
+// Plain 2D sampler: common texture state plus filter state.
+struct sampler2D_impl : samplerCommon, samplerFilter {};
+typedef sampler2D_impl* sampler2D;
+
+// Empty per-format subtypes of the 2D sampler.
+typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8;
+typedef struct sampler2DRG8_impl : sampler2D_impl{} * sampler2DRG8;
+typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8;
+typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F;
+
+// Integer 2D sampler: common texture state only (no filter state).
+struct isampler2D_impl : samplerCommon {};
+typedef isampler2D_impl* isampler2D;
+
+struct isampler2DRGBA32I_impl : isampler2D_impl {};
+typedef isampler2DRGBA32I_impl* isampler2DRGBA32I;
+
+// Rectangle sampler: same state as sampler2D_impl, but a distinct type.
+struct sampler2DRect_impl : samplerCommon, samplerFilter {};
+typedef sampler2DRect_impl* sampler2DRect;
+
+// Reductions over a vector condition: report whether all, any, or none of
+// its lanes are set.
+#if USE_SSE2
+SI bool test_all(Bool cond) {
+  const int laneBits = _mm_movemask_ps(cond);
+  return laneBits == 0xF;
+}
+SI bool test_any(Bool cond) {
+  const int laneBits = _mm_movemask_ps(cond);
+  return laneBits != 0;
+}
+SI bool test_none(Bool cond) {
+  const int laneBits = _mm_movemask_ps(cond);
+  return laneBits == 0;
+}
+#else
+// Without SSE2, narrow each lane to a byte and compare the packed 32-bit
+// result against all-ones / zero.
+SI bool test_all(Bool cond) {
+  const uint32_t packed = bit_cast<uint32_t>(CONVERT(cond, U8));
+  return packed == 0xFFFFFFFFU;
+}
+SI bool test_any(Bool cond) {
+  const uint32_t packed = bit_cast<uint32_t>(CONVERT(cond, U8));
+  return packed != 0;
+}
+SI bool test_none(Bool cond) {
+  const uint32_t packed = bit_cast<uint32_t>(CONVERT(cond, U8));
+  return packed == 0;
+}
+#endif
+
+// Scalar overloads: convert one value to float.
+float make_float(float n) { return n; }
+
+float make_float(int32_t n) { return static_cast<float>(n); }
+
+float make_float(uint32_t n) { return static_cast<float>(n); }
+
+float make_float(bool n) { return static_cast<float>(n); }
+
+// Generic overload: convert any other argument to a Float vector via CONVERT.
+template <typename T>
+Float make_float(T v) {
+  Float converted = CONVERT(v, Float);
+  return converted;
+}
+
+// Scalar overloads: convert one value to int32_t.
+int32_t make_int(uint32_t n) { return static_cast<int32_t>(n); }
+
+int32_t make_int(int32_t n) { return n; }
+
+int32_t make_int(float n) { return static_cast<int32_t>(n); }
+
+int32_t make_int(bool n) { return static_cast<int32_t>(n); }
+
+// Generic overload: convert any other argument to an I32 vector via CONVERT.
+template <typename T>
+I32 make_int(T v) {
+  I32 converted = CONVERT(v, I32);
+  return converted;
+}
+
+// Scalar overloads: convert one value to uint32_t.
+uint32_t make_uint(uint32_t n) { return n; }
+
+uint32_t make_uint(int32_t n) { return static_cast<uint32_t>(n); }
+
+uint32_t make_uint(float n) { return static_cast<uint32_t>(n); }
+
+uint32_t make_uint(bool n) { return static_cast<uint32_t>(n); }
+
+// Generic overload: convert any other argument to a U32 vector via CONVERT.
+template <typename T>
+U32 make_uint(T v) {
+  U32 converted = CONVERT(v, U32);
+  return converted;
+}
+
+// Values that are already scalar pass through unchanged.
+template <typename T>
+T force_scalar(T n) {
+  return n;
+}
+
+// Vector values collapse to their first lane.
+float force_scalar(Float f) {
+  const float firstLane = f[0];
+  return firstLane;
+}
+
+int32_t force_scalar(I32 i) {
+  const int32_t firstLane = i[0];
+  return firstLane;
+}
+
+struct vec4;
+struct ivec2;
+
+// if_then_else(c, t, e): select `t` where `c` is set and `e` otherwise.
+// Scalar conditions choose one operand outright; vector conditions are used
+// as per-lane bit masks, blending as (c & t) | (~c & e).
+SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) {
+  if (c) return t;
+  return e;
+}
+
+SI float if_then_else(int32_t c, float t, float e) {
+  if (c) return t;
+  return e;
+}
+
+SI Float if_then_else(I32 c, float t, float e) {
+  const I32 taken = c & bit_cast<I32>(Float(t));
+  const I32 fallback = ~c & bit_cast<I32>(Float(e));
+  return bit_cast<Float>(taken | fallback);
+}
+
+SI I32 if_then_else(I32 c, int32_t t, int32_t e) {
+  const I32 taken = c & I32(t);
+  const I32 fallback = ~c & I32(e);
+  return taken | fallback;
+}
+
+SI U32 if_then_else(I32 c, U32 t, U32 e) {
+  const I32 taken = c & bit_cast<I32>(t);
+  const I32 fallback = ~c & bit_cast<I32>(e);
+  return bit_cast<U32>(taken | fallback);
+}
+
+SI Float if_then_else(I32 c, Float t, Float e) {
+  const I32 taken = c & bit_cast<I32>(t);
+  const I32 fallback = ~c & bit_cast<I32>(e);
+  return bit_cast<Float>(taken | fallback);
+}
+
+SI Float if_then_else(int32_t c, Float t, Float e) {
+  if (c) return t;
+  return e;
+}
+
+SI Bool if_then_else(I32 c, Bool t, Bool e) {
+  return (c & t) | (~c & e);
+}
+
+SI Bool if_then_else(int32_t c, Bool t, Bool e) {
+  if (c) return t;
+  return e;
+}
+
+SI I16 if_then_else(I16 c, I16 t, I16 e) {
+  return (c & t) | (~c & e);
+}
+
+// Exchange the values of `a` and `b` through a temporary copy of `a`.
+template <typename T>
+SI void swap(T& a, T& b) {
+  T held(a);
+  a = b;
+  b = held;
+}
+
+// Scalar integer min/max/clamp.
+SI int32_t min(int32_t a, int32_t b) {
+  if (a < b) return a;
+  return b;
+}
+SI int32_t max(int32_t a, int32_t b) {
+  if (a > b) return a;
+  return b;
+}
+
+// Raise `a` to at least minVal, then cap it at maxVal.
+SI int32_t clamp(int32_t a, int32_t minVal, int32_t maxVal) {
+  const int32_t raised = max(a, minVal);
+  return min(raised, maxVal);
+}
+
+// Scalar float min/max/clamp. When the comparison is false (including when
+// either operand is NaN) the second argument is returned.
+SI float min(float a, float b) {
+  if (a < b) return a;
+  return b;
+}
+SI float max(float a, float b) {
+  if (a > b) return a;
+  return b;
+}
+
+SI float clamp(float a, float minVal, float maxVal) {
+  const float raised = max(a, minVal);
+  return min(raised, maxVal);
+}
+
+// Per-lane minimum of two Float vectors, using the native instruction where
+// one is available and a compare-and-select otherwise.
+SI Float min(Float a, Float b) {
+#if USE_SSE2
+  return _mm_min_ps(a, b);
+#elif USE_NEON
+  return vminq_f32(a, b);
+#else
+  const auto takeFirst = a < b;
+  return if_then_else(takeFirst, a, b);
+#endif
+}
+
+// Per-lane maximum of two Float vectors.
+SI Float max(Float a, Float b) {
+#if USE_SSE2
+  return _mm_max_ps(a, b);
+#elif USE_NEON
+  return vmaxq_f32(a, b);
+#else
+  const auto takeFirst = a > b;
+  return if_then_else(takeFirst, a, b);
+#endif
+}
+
+// Per-lane clamp: raise to at least minVal, then cap at maxVal.
+SI Float clamp(Float a, Float minVal, Float maxVal) {
+  const Float raised = max(a, minVal);
+  return min(raised, maxVal);
+}
+
+#define sqrt __glsl_sqrt
+
+// Scalar square root.
+SI float sqrt(float x) { return sqrtf(x); }
+
+// Per-lane square root of a Float vector.
+SI Float sqrt(Float v) {
+#if USE_SSE2
+  return _mm_sqrt_ps(v);
+#elif USE_NEON
+  // Start from the hardware reciprocal-sqrt estimate, refine it with two
+  // Newton-Raphson steps, then recover sqrt(v) as v * (1 / sqrt(v)).
+  Float e = vrsqrteq_f32(v);
+  e *= vrsqrtsq_f32(v, e * e);
+  e *= vrsqrtsq_f32(v, e * e);
+  // The reciprocal estimate is infinite for v == 0, so v * e would be
+  // 0 * inf = NaN in those lanes; return 0 there instead.
+  return if_then_else(v != Float(0.0f), v * e, Float(0.0f));
+#else
+  return (Float){sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w)};
+#endif
+}
+
+// Scalar reciprocal square root.
+SI float inversesqrt(float x) { return 1.0f / sqrtf(x); }
+
+// Per-lane reciprocal square root of a Float vector.
+SI Float inversesqrt(Float v) {
+#if USE_SSE2
+ // NOTE(review): _mm_rsqrt_ps is documented by Intel as an approximation, so
+ // this path is less precise than the scalar overload - confirm acceptable.
+ return _mm_rsqrt_ps(v);
+#elif USE_NEON
+ // Hardware estimate refined by a single Newton-Raphson step.
+ Float e = vrsqrteq_f32(v);
+ return vrsqrtsq_f32(v, e * e) * e;
+#else
+ return 1.0f / sqrt(v);
+#endif
+}
+
+// step(edge, x): 1.0 where x >= edge, otherwise 0.0.
+SI float step(float edge, float x) {
+  const bool reached = x >= edge;
+  return float(reached);
+}
+
+// Per-lane variant: lanes where x < edge yield 0, every other lane yields 1.
+SI Float step(Float edge, Float x) {
+  const auto below = x < edge;
+  return if_then_else(below, Float(0), Float(1));
+}
+
+/*
+enum RGBA {
+ R,
+ G,
+ B,
+ A
+};*/
+
+// Component selectors passed to the select()/sel() members of the vector
+// types. R/G/B/A are aliases for X/Y/Z/W and share the same values.
+enum XYZW {
+ X = 0,
+ Y = 1,
+ Z = 2,
+ W = 3,
+ R = 0,
+ G = 1,
+ B = 2,
+ A = 3,
+};
+
+// Two-component boolean vector holding one plain bool per component.
+struct bvec2_scalar {
+ bool x;
+ bool y;
+
+ // Default-constructs to (false, false).
+ bvec2_scalar() : bvec2_scalar(false) {}
+ // Broadcasts `a` to both components.
+ IMPLICIT constexpr bvec2_scalar(bool a) : x(a), y(a) {}
+ constexpr bvec2_scalar(bool x, bool y) : x(x), y(y) {}
+};
+
+// Two-component boolean vector whose components are Bool vectors.
+struct bvec2 {
+ bvec2() : bvec2(0) {}
+ // Broadcasts `a` to both components.
+ IMPLICIT bvec2(Bool a) : x(a), y(a) {}
+ bvec2(Bool x, Bool y) : x(x), y(y) {}
+ // Returns a reference to the component named by `c`; only X and Y are
+ // handled, any other selector hits UNREACHABLE.
+ Bool& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ default:
+ UNREACHABLE;
+ }
+ }
+ // Single-component swizzle; returns a copy of the selected component.
+ Bool sel(XYZW c1) { return select(c1); }
+
+ // Component-wise bitwise complement.
+ bvec2 operator~() { return bvec2(~x, ~y); }
+
+ Bool x;
+ Bool y;
+};
+
+// Scalar constructors: build a bvec2_scalar from plain bools.
+bvec2_scalar make_bvec2(bool n) { return bvec2_scalar(n, n); }
+
+bvec2_scalar make_bvec2(bool x, bool y) { return bvec2_scalar(x, y); }
+
+// Generic constructors: forward any other argument(s) to bvec2.
+template <typename N>
+bvec2 make_bvec2(const N& n) {
+  bvec2 built(n);
+  return built;
+}
+
+template <typename X, typename Y>
+bvec2 make_bvec2(const X& x, const Y& y) {
+  bvec2 built(x, y);
+  return built;
+}
+
+struct vec4_scalar;
+
+// Two-component float vector holding one plain float per component.
+struct vec2_scalar {
+  typedef struct vec2 vector_type;
+  typedef float element_type;
+
+  float x;
+  float y;
+
+  // Default-constructs to (0, 0); single-argument forms broadcast the value.
+  constexpr vec2_scalar() : vec2_scalar(0.0f) {}
+  IMPLICIT constexpr vec2_scalar(float a) : x(a), y(a) {}
+  IMPLICIT constexpr vec2_scalar(int a) : x(a), y(a) {}
+  constexpr vec2_scalar(float x, float y) : x(x), y(y) {}
+
+  // Returns a reference to the component named by `c`; only X and Y are
+  // handled, any other selector hits UNREACHABLE.
+  float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  // Swizzles: one selector yields a component reference, two yield a new
+  // vec2_scalar, four yield a vec4_scalar (defined out of line).
+  float& sel(XYZW c1) { return select(c1); }
+  vec2_scalar sel(XYZW c1, XYZW c2) {
+    return vec2_scalar(select(c1), select(c2));
+  }
+  vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+
+  friend bool operator==(const vec2_scalar& l, const vec2_scalar& r) {
+    return l.x == r.x && l.y == r.y;
+  }
+
+  friend bool operator!=(const vec2_scalar& l, const vec2_scalar& r) {
+    return l.x != r.x || l.y != r.y;
+  }
+
+  // Component-wise arithmetic.
+  friend vec2_scalar operator*(float a, vec2_scalar b) {
+    return vec2_scalar(a * b.x, a * b.y);
+  }
+  friend vec2_scalar operator*(vec2_scalar a, float b) {
+    return vec2_scalar(a.x * b, a.y * b);
+  }
+  friend vec2_scalar operator*(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x * b.x, a.y * b.y);
+  }
+  friend vec2_scalar operator/(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x / b.x, a.y / b.y);
+  }
+
+  friend vec2_scalar operator-(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x - b.x, a.y - b.y);
+  }
+  friend vec2_scalar operator+(vec2_scalar a, vec2_scalar b) {
+    return vec2_scalar(a.x + b.x, a.y + b.y);
+  }
+  friend vec2_scalar operator+(vec2_scalar a, float b) {
+    return vec2_scalar(a.x + b, a.y + b);
+  }
+
+  // Negation does not modify the vector, so it is const (as in ivec2_scalar).
+  vec2_scalar operator-() const { return vec2_scalar(-x, -y); }
+
+  // Compound assignments update this vector in place and return a reference
+  // to it (matching ivec2_scalar) rather than a copy.
+  vec2_scalar& operator*=(vec2_scalar a) {
+    x *= a.x;
+    y *= a.y;
+    return *this;
+  }
+
+  vec2_scalar& operator/=(vec2_scalar a) {
+    x /= a.x;
+    y /= a.y;
+    return *this;
+  }
+
+  vec2_scalar& operator+=(vec2_scalar a) {
+    x += a.x;
+    y += a.y;
+    return *this;
+  }
+
+  vec2_scalar& operator-=(vec2_scalar a) {
+    x -= a.x;
+    y -= a.y;
+    return *this;
+  }
+};
+
+// Writable view over two existing floats, returned by lsel() so that a
+// two-component swizzle can be assigned to. Holds references, not copies.
+struct vec2_scalar_ref {
+ vec2_scalar_ref(float& x, float& y) : x(x), y(y) {}
+ float& x;
+ float& y;
+
+ // Returns the referenced component named by `c`; only X and Y are handled.
+ float& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ default:
+ UNREACHABLE;
+ }
+ }
+ float& sel(XYZW c1) { return select(c1); }
+
+ // Writes through to the referenced floats.
+ vec2_scalar_ref& operator=(const vec2_scalar& a) {
+ x = a.x;
+ y = a.y;
+ return *this;
+ }
+ vec2_scalar_ref& operator*=(vec2_scalar a) {
+ x *= a.x;
+ y *= a.y;
+ return *this;
+ }
+ // Snapshot of the referenced values as an independent vec2_scalar.
+ operator vec2_scalar() const { return vec2_scalar{x, y}; }
+};
+
+// Two-component float vector whose components are Float vectors.
+struct vec2 {
+  typedef struct vec2 vector_type;
+  typedef float element_type;
+
+  // Default-constructs to zero; the single-Float form broadcasts to both
+  // components.
+  constexpr vec2() : vec2(Float(0.0f)) {}
+  IMPLICIT constexpr vec2(Float a) : x(a), y(a) {}
+  vec2(Float x, Float y) : x(x), y(y) {}
+  // Broadcasts one scalar vector across all lanes.
+  IMPLICIT constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {}
+  // Packs four scalar vectors, one per lane.
+  constexpr vec2(vec2_scalar s0, vec2_scalar s1, vec2_scalar s2, vec2_scalar s3)
+      : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}) {}
+  explicit vec2(ivec2 a);
+  Float x;
+  Float y;
+
+  // Returns a reference to the component named by `c`; only X and Y are
+  // handled, any other selector hits UNREACHABLE.
+  Float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Float& sel(XYZW c1) { return select(c1); }
+  vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); }
+
+  vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+
+  // Compound assignments update this vector in place and return a reference
+  // to it (matching ivec2 and vec2_ref) rather than a copy.
+  vec2& operator*=(Float a) {
+    x *= a;
+    y *= a;
+    return *this;
+  }
+  vec2& operator*=(vec2 a) {
+    x *= a.x;
+    y *= a.y;
+    return *this;
+  }
+
+  vec2& operator/=(Float a) {
+    x /= a;
+    y /= a;
+    return *this;
+  }
+  vec2& operator/=(vec2 a) {
+    x /= a.x;
+    y /= a.y;
+    return *this;
+  }
+
+  vec2& operator+=(vec2 a) {
+    x += a.x;
+    y += a.y;
+    return *this;
+  }
+  vec2& operator-=(vec2 a) {
+    x -= a.x;
+    y -= a.y;
+    return *this;
+  }
+  vec2& operator-=(Float a) {
+    x -= a;
+    y -= a;
+    return *this;
+  }
+
+  // Negation does not modify the vector, so it is const.
+  vec2 operator-() const { return vec2(-x, -y); }
+
+  // Comparisons return a per-lane mask.
+  friend I32 operator==(const vec2& l, const vec2& r) {
+    return l.x == r.x && l.y == r.y;
+  }
+
+  friend I32 operator!=(const vec2& l, const vec2& r) {
+    return l.x != r.x || l.y != r.y;
+  }
+
+  // Component-wise arithmetic.
+  friend vec2 operator*(vec2 a, Float b) { return vec2(a.x * b, a.y * b); }
+  friend vec2 operator*(vec2 a, vec2 b) { return vec2(a.x * b.x, a.y * b.y); }
+  friend vec2 operator*(Float a, vec2 b) { return vec2(a * b.x, a * b.y); }
+
+  friend vec2 operator/(vec2 a, vec2 b) { return vec2(a.x / b.x, a.y / b.y); }
+  friend vec2 operator/(vec2 a, Float b) { return vec2(a.x / b, a.y / b); }
+
+  friend vec2 operator-(vec2 a, vec2 b) { return vec2(a.x - b.x, a.y - b.y); }
+  friend vec2 operator-(vec2 a, Float b) { return vec2(a.x - b, a.y - b); }
+  friend vec2 operator-(Float a, vec2 b) { return vec2(a - b.x, a - b.y); }
+  friend vec2 operator+(vec2 a, vec2 b) { return vec2(a.x + b.x, a.y + b.y); }
+  friend vec2 operator+(vec2 a, Float b) { return vec2(a.x + b, a.y + b); }
+  friend vec2 operator+(Float a, vec2 b) { return vec2(a + b.x, a + b.y); }
+};
+
+// Collapse a vec2 to a vec2_scalar by taking the first lane of each
+// component.
+vec2_scalar force_scalar(const vec2& v) {
+  const float firstX = force_scalar(v.x);
+  const float firstY = force_scalar(v.y);
+  return vec2_scalar{firstX, firstY};
+}
+
+// Scalar constructors: build a vec2_scalar from plain numbers.
+vec2_scalar make_vec2(float n) { return vec2_scalar(n, n); }
+
+vec2_scalar make_vec2(float x, float y) { return vec2_scalar(x, y); }
+
+vec2_scalar make_vec2(int32_t x, int32_t y) {
+  return vec2_scalar(static_cast<float>(x), static_cast<float>(y));
+}
+
+// Generic constructors: forward any other argument(s) to vec2.
+template <typename N>
+vec2 make_vec2(const N& n) {
+  vec2 built(n);
+  return built;
+}
+
+template <typename X, typename Y>
+vec2 make_vec2(const X& x, const Y& y) {
+  vec2 built(x, y);
+  return built;
+}
+
+// Scale a scalar vec2 by a Float vector, producing a full vec2.
+vec2 operator*(vec2_scalar a, Float b) {
+  const Float scaledX = a.x * b;
+  const Float scaledY = a.y * b;
+  return vec2(scaledX, scaledY);
+}
+
+vec2 operator*(Float a, vec2_scalar b) {
+  const Float scaledX = a * b.x;
+  const Float scaledY = a * b.y;
+  return vec2(scaledX, scaledY);
+}
+
+// Component-wise minimum.
+SI vec2 min(vec2 a, vec2 b) {
+  const Float lowX = min(a.x, b.x);
+  const Float lowY = min(a.y, b.y);
+  return vec2(lowX, lowY);
+}
+
+SI vec2_scalar min(vec2_scalar a, vec2_scalar b) {
+  const float lowX = min(a.x, b.x);
+  const float lowY = min(a.y, b.y);
+  return vec2_scalar{lowX, lowY};
+}
+
+// Component-wise select; a vector condition applies per lane, a scalar
+// condition picks one whole operand.
+SI vec2 if_then_else(I32 c, vec2 t, vec2 e) {
+  const Float pickedX = if_then_else(c, t.x, e.x);
+  const Float pickedY = if_then_else(c, t.y, e.y);
+  return vec2(pickedX, pickedY);
+}
+
+SI vec2 if_then_else(int32_t c, vec2 t, vec2 e) {
+  if (c) return t;
+  return e;
+}
+
+// Component-wise step(edge, x).
+vec2 step(vec2 edge, vec2 x) {
+  const Float stepX = step(edge.x, x.x);
+  const Float stepY = step(edge.y, x.y);
+  return vec2(stepX, stepY);
+}
+
+vec2_scalar step(vec2_scalar edge, vec2_scalar x) {
+  const float stepX = step(edge.x, x.x);
+  const float stepY = step(edge.y, x.y);
+  return vec2_scalar(stepX, stepY);
+}
+
+// Component-wise maximum, against another vector or a single bound.
+vec2 max(vec2 a, vec2 b) {
+  const Float highX = max(a.x, b.x);
+  const Float highY = max(a.y, b.y);
+  return vec2(highX, highY);
+}
+vec2 max(vec2 a, Float b) {
+  const Float highX = max(a.x, b);
+  const Float highY = max(a.y, b);
+  return vec2(highX, highY);
+}
+
+SI vec2_scalar max(vec2_scalar a, vec2_scalar b) {
+  const float highX = max(a.x, b.x);
+  const float highY = max(a.y, b.y);
+  return vec2_scalar{highX, highY};
+}
+SI vec2_scalar max(vec2_scalar a, float b) {
+  const float highX = max(a.x, b);
+  const float highY = max(a.y, b);
+  return vec2_scalar{highX, highY};
+}
+
+// Euclidean length of a vec2, per lane. The sum of squares is kept as one
+// expression so floating-point contraction behaves as before.
+Float length(vec2 a) {
+  const Float squared = a.x * a.x + a.y * a.y;
+  return sqrt(squared);
+}
+
+// Euclidean length of a scalar vec2.
+float length(vec2_scalar a) { return hypotf(a.x, a.y); }
+
+// Distance between two points: the length of their difference.
+SI Float distance(vec2 a, vec2 b) {
+  const vec2 delta = a - b;
+  return length(delta);
+}
+
+// Scale `a` by the reciprocal of its length.
+SI vec2 normalize(vec2 a) {
+  const Float len = length(a);
+  return a / len;
+}
+
+#define abs __glsl_abs
+
+// Scalar absolute value.
+int32_t abs(int32_t a) {
+  if (a < 0) return -a;
+  return a;
+}
+
+float abs(float a) { return fabsf(a); }
+
+// Per-lane absolute value of a Float vector.
+Float abs(Float v) {
+#if USE_NEON
+  return vabsq_f32(v);
+#else
+  // v and (0 - v) differ at most in their sign bit, so ANDing the two bit
+  // patterns leaves the magnitude with the sign cleared.
+  const I32 bits = bit_cast<I32>(v);
+  const I32 negatedBits = bit_cast<I32>(0.0f - v);
+  return bit_cast<Float>(bits & negatedBits);
+#endif
+}
+
+// Lane-wise numeric conversions between integer and float vectors. The U32
+// overload first reinterprets its input as I32, so it converts as signed.
+Float cast(U32 v) {
+  Float converted = CONVERT((I32)v, Float);
+  return converted;
+}
+Float cast(I32 v) {
+  Float converted = CONVERT((I32)v, Float);
+  return converted;
+}
+I32 cast(Float v) {
+  I32 converted = CONVERT(v, I32);
+  return converted;
+}
+
+#define floor __glsl_floor
+
+// Scalar floor.
+float floor(float a) { return floorf(a); }
+
+// Per-lane floor: convert to I32 and back, then subtract one in lanes where
+// that round trip produced a value above the input.
+Float floor(Float v) {
+  const Float viaInt = cast(cast(v));
+  const Float overshoot = if_then_else(viaInt > v, Float(1), Float(0));
+  return viaInt - overshoot;
+}
+
+// Component-wise floor.
+vec2 floor(vec2 v) {
+  const Float flooredX = floor(v.x);
+  const Float flooredY = floor(v.y);
+  return vec2(flooredX, flooredY);
+}
+
+vec2_scalar floor(vec2_scalar v) {
+  const float flooredX = floorf(v.x);
+  const float flooredY = floorf(v.y);
+  return vec2_scalar{flooredX, flooredY};
+}
+
+#define ceil __glsl_ceil
+
+// Scalar ceiling.
+float ceil(float a) { return ceilf(a); }
+
+// Per-lane ceiling: convert to I32 and back, then add one in lanes where
+// that round trip produced a value below the input.
+Float ceil(Float v) {
+  const Float viaInt = cast(cast(v));
+  const Float shortfall = if_then_else(viaInt < v, Float(1), Float(0));
+  return viaInt + shortfall;
+}
+
+// Round to nearest even
+// Both overloads return v * scale rounded to an integer.
+SI int32_t roundeven(float v, float scale) {
+#if USE_SSE2
+ return _mm_cvtss_si32(_mm_set_ss(v * scale));
+#else
+ // Same magic-number trick as the vector overload below: 0xC00000 is
+ // 1.5 * 2^23 and 0x4B400000 is the bit pattern of that value as a float, so
+ // after the add the low bits of the result hold the rounded integer.
+ return bit_cast<int32_t>(v * scale + float(0xC00000)) - 0x4B400000;
+#endif
+}
+
+SI I32 roundeven(Float v, Float scale) {
+#if USE_SSE2
+ return _mm_cvtps_epi32(v * scale);
+#else
+ // Magic number implementation of round-to-nearest-even
+ // see http://stereopsis.com/sree/fpu2006.html
+ return bit_cast<I32>(v * scale + Float(0xC00000)) - 0x4B400000;
+#endif
+}
+
+// Round towards zero
+SI int32_t roundzero(float v, float scale) {
+  const float scaled = v * scale;
+  return int32_t(scaled);
+}
+
+SI I32 roundzero(Float v, Float scale) {
+  const Float scaled = v * scale;
+  return cast(scaled);
+}
+
+// Round whichever direction is fastest for positive numbers
+SI I32 roundfast(Float v, Float scale) {
+#if USE_SSE2
+  return _mm_cvtps_epi32(v * scale);
+#else
+  // Bias by one half before converting; the multiply-add stays in a single
+  // expression so its floating-point evaluation is unchanged.
+  const Float biased = v * scale + 0.5f;
+  return cast(biased);
+#endif
+}
+
+// Scale a value by 255 and round it to an integer using roundfast.
+template <typename T>
+SI auto round_pixel(T v) {
+  const float kScale = 255.0f;
+  return roundfast(v, kScale);
+}
+
+#define round __glsl_round
+
+// Scalar round.
+float round(float a) { return roundf(a); }
+
+// Per-lane round, computed as floor(v + 0.5).
+Float round(Float v) {
+  const Float shifted = v + 0.5f;
+  return floor(shifted);
+}
+
+// Fractional part: the value minus its floor.
+float fract(float a) {
+  const float whole = floor(a);
+  return a - whole;
+}
+
+Float fract(Float v) {
+  const Float whole = floor(v);
+  return v - whole;
+}
+
+vec2 fract(vec2 v) {
+  const Float fracX = fract(v.x);
+  const Float fracY = fract(v.y);
+  return vec2(fracX, fracY);
+}
+
+// X derivatives can be approximated by dFdx(x) = x[1] - x[0].
+// Y derivatives are not easily available since we operate in terms of X spans
+// only. To work around, assume dFdy(p.x) = dFdx(p.y), which only holds for
+// uniform scaling, and thus abs(dFdx(p.x)) + abs(dFdy(p.x)) = abs(dFdx(p.x)) +
+// abs(dFdx(p.y)) which mirrors abs(dFdx(p.y)) + abs(dFdy(p.y)) = abs(dFdx(p.y))
+// + abs(dFdx(p.x)).
+vec2 fwidth(vec2 p) {
+ // NOTE(review): assuming SHUFFLE indexes the concatenation of its two
+ // vector arguments (0-3 from p.x, 4-7 from p.y), d holds |p.x[1] - p.x[0]|
+ // in lanes 0-1 and |p.y[1] - p.y[0]| in lanes 2-3 - confirm against the
+ // SHUFFLE definition.
+ Float d = abs(SHUFFLE(p.x, p.y, 1, 1, 5, 5) - SHUFFLE(p.x, p.y, 0, 0, 4, 4));
+ // Adding the half-swapped vector puts the sum of both terms in every lane;
+ // the single-Float vec2 constructor then uses it for both components.
+ return vec2(d + d.zwxy);
+}
+
+// See
+// http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
+// Fast per-lane approximation of log2(x), built from the raw bits of x.
+Float approx_log2(Float x) {
+ // e - 127 is a fair approximation of log2(x) in its own right...
+ Float e = cast(bit_cast<U32>(x)) * (1.0f / (1 << 23));
+
+ // ... but using the mantissa to refine its error is _much_ better.
+ // m reuses the mantissa bits of x under a fixed exponent (0x3f000000).
+ Float m = bit_cast<Float>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
+ return e - 124.225514990f - 1.498030302f * m -
+ 1.725879990f / (0.3520887068f + m);
+}
+
+// Fast per-lane approximation of 2^x: builds the result's bit pattern as an
+// integer and reinterprets it as a float.
+Float approx_pow2(Float x) {
+ Float f = fract(x);
+ // Note: roundfast is declared as (v, scale) but is called here with the
+ // constant 2^23 first; this is harmless because roundfast only multiplies
+ // its two arguments together.
+ return bit_cast<Float>(
+ roundfast(1.0f * (1 << 23), x + 121.274057500f - 1.490129070f * f +
+ 27.728023300f / (4.84252568f - f)));
+}
+
+#define pow __glsl_pow
+
+// Scalar power.
+SI float pow(float x, float y) { return powf(x, y); }
+
+// Per-lane power via approx_pow2(approx_log2(x) * y). Lanes where x is
+// exactly 0 or 1 return x unchanged.
+Float pow(Float x, Float y) {
+  const auto passThrough = (x == 0) | (x == 1);
+  return if_then_else(passThrough, x, approx_pow2(approx_log2(x) * y));
+}
+
+#define exp __glsl_exp
+
+// Scalar e^x.
+SI float exp(float x) { return expf(x); }
+
+// Per-lane e^y, computed as 2^(log2(e) * y).
+Float exp(Float y) {
+  const float kLog2E = 1.4426950408889634074f;
+  const Float exponent = kLog2E * y;
+  return approx_pow2(exponent);
+}
+
+#define exp2 __glsl_exp2
+
+// Scalar 2^x.
+SI float exp2(float x) { return exp2f(x); }
+
+// Per-lane 2^x (approximate).
+Float exp2(Float x) { return approx_pow2(x); }
+
+#define log __glsl_log
+
+// Scalar natural logarithm.
+SI float log(float x) { return logf(x); }
+
+// Per-lane natural logarithm: approximate log2(x) scaled by ln(2).
+Float log(Float x) {
+  const float kLn2 = 0.69314718f;
+  return approx_log2(x) * kLn2;
+}
+
+#define log2 __glsl_log2
+
+// Scalar base-2 logarithm.
+SI float log2(float x) { return log2f(x); }
+
+// Per-lane base-2 logarithm (approximate).
+Float log2(Float x) { return approx_log2(x); }
+
+struct ivec4;
+
+// Two-component integer vector holding one plain int32_t per component.
+struct ivec2_scalar {
+ typedef int32_t element_type;
+
+ int32_t x;
+ int32_t y;
+
+ // Default-constructs to (0, 0); the single-argument form broadcasts.
+ ivec2_scalar() : ivec2_scalar(0) {}
+ IMPLICIT constexpr ivec2_scalar(int32_t a) : x(a), y(a) {}
+ constexpr ivec2_scalar(int32_t x, int32_t y) : x(x), y(y) {}
+
+ // Returns a reference to the component named by `c`; only X and Y are
+ // handled, any other selector hits UNREACHABLE.
+ int32_t& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ default:
+ UNREACHABLE;
+ }
+ }
+ // Swizzles: one selector yields a reference, two yield a new vector.
+ int32_t& sel(XYZW c1) { return select(c1); }
+ ivec2_scalar sel(XYZW c1, XYZW c2) {
+ return ivec2_scalar{select(c1), select(c2)};
+ }
+
+ ivec2_scalar operator-() const { return ivec2_scalar{-x, -y}; }
+
+ // In-place updates; each returns a reference to this vector.
+ ivec2_scalar& operator+=(ivec2_scalar a) {
+ x += a.x;
+ y += a.y;
+ return *this;
+ }
+ ivec2_scalar& operator+=(int n) {
+ x += n;
+ y += n;
+ return *this;
+ }
+
+ ivec2_scalar& operator>>=(int shift) {
+ x >>= shift;
+ y >>= shift;
+ return *this;
+ }
+
+ // Component-wise binary operators.
+ friend ivec2_scalar operator&(ivec2_scalar a, int b) {
+ return ivec2_scalar{a.x & b, a.y & b};
+ }
+
+ friend ivec2_scalar operator+(ivec2_scalar a, ivec2_scalar b) {
+ return ivec2_scalar{a.x + b.x, a.y + b.y};
+ }
+
+ friend ivec2_scalar operator-(ivec2_scalar a, ivec2_scalar b) {
+ return ivec2_scalar{a.x - b.x, a.y - b.y};
+ }
+
+ friend bool operator==(const ivec2_scalar& l, const ivec2_scalar& r) {
+ return l.x == r.x && l.y == r.y;
+ }
+};
+
+// Two-component integer vector whose components are I32 vectors.
+struct ivec2 {
+ typedef int32_t element_type;
+
+ // Default-constructs to zero; the single-I32 form broadcasts.
+ ivec2() : ivec2(I32(0)) {}
+ IMPLICIT ivec2(I32 a) : x(a), y(a) {}
+ ivec2(I32 x, I32 y) : x(x), y(y) {}
+ // Converts each float component with cast().
+ IMPLICIT ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {}
+ ivec2(U32 x, U32 y) : x(CONVERT(x, I32)), y(CONVERT(y, I32)) {}
+ // Broadcasts one scalar vector across all lanes.
+ IMPLICIT constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {}
+ // Packs four scalar vectors, one per lane.
+ constexpr ivec2(ivec2_scalar s0, ivec2_scalar s1, ivec2_scalar s2,
+ ivec2_scalar s3)
+ : x(I32{s0.x, s1.x, s2.x, s3.x}), y(I32{s0.y, s1.y, s2.y, s3.y}) {}
+ I32 x;
+ I32 y;
+
+ // Returns a reference to the component named by `c`; only X and Y are
+ // handled, any other selector hits UNREACHABLE.
+ I32& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ default:
+ UNREACHABLE;
+ }
+ }
+ I32& sel(XYZW c1) { return select(c1); }
+
+ ivec2 sel(XYZW c1, XYZW c2) { return ivec2(select(c1), select(c2)); }
+
+ ivec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
+
+ // In-place updates; each returns a reference to this vector.
+ ivec2& operator*=(I32 a) {
+ x *= a;
+ y *= a;
+ return *this;
+ }
+ ivec2& operator+=(ivec2 a) {
+ x += a.x;
+ y += a.y;
+ return *this;
+ }
+ ivec2& operator>>=(int shift) {
+ x >>= shift;
+ y >>= shift;
+ return *this;
+ }
+
+ // Component-wise binary operators.
+ friend ivec2 operator*(ivec2 a, I32 b) { return ivec2(a.x * b, a.y * b); }
+ friend ivec2 operator&(ivec2 a, ivec2 b) {
+ return ivec2(a.x & b.x, a.y & b.y);
+ }
+ friend ivec2 operator&(ivec2 a, I32 b) { return ivec2(a.x & b, a.y & b); }
+ friend ivec2 operator+(ivec2 a, ivec2 b) {
+ return ivec2(a.x + b.x, a.y + b.y);
+ }
+};
+
+// Out-of-line vec2 constructor (needs the complete ivec2 type): converts
+// each integer component to float.
+vec2::vec2(ivec2 a) : x(cast(a.x)), y(cast(a.y)) {}
+
+// Scalar constructors: build an ivec2_scalar from plain integers.
+ivec2_scalar make_ivec2(int32_t n) { return ivec2_scalar(n, n); }
+
+ivec2_scalar make_ivec2(uint32_t n) {
+  const int32_t asSigned = static_cast<int32_t>(n);
+  return ivec2_scalar(asSigned, asSigned);
+}
+
+ivec2_scalar make_ivec2(int32_t x, int32_t y) { return ivec2_scalar(x, y); }
+
+ivec2_scalar make_ivec2(uint32_t x, uint32_t y) {
+  return ivec2_scalar(static_cast<int32_t>(x), static_cast<int32_t>(y));
+}
+
+// Scalar conversions between the integer and float two-component vectors.
+vec2_scalar make_vec2(const ivec2_scalar& v) {
+  return vec2_scalar(static_cast<float>(v.x), static_cast<float>(v.y));
+}
+
+ivec2_scalar make_ivec2(const vec2_scalar& v) {
+  return ivec2_scalar(static_cast<int32_t>(v.x), static_cast<int32_t>(v.y));
+}
+
+// Generic constructors: forward any other argument(s) to ivec2.
+template <typename N>
+ivec2 make_ivec2(const N& n) {
+  ivec2 built(n);
+  return built;
+}
+
+template <typename X, typename Y>
+ivec2 make_ivec2(const X& x, const Y& y) {
+  ivec2 built(x, y);
+  return built;
+}
+
+// Collapse an ivec2 to an ivec2_scalar using the first lane of each
+// component.
+ivec2_scalar force_scalar(const ivec2& v) {
+  const int32_t firstX = force_scalar(v.x);
+  const int32_t firstY = force_scalar(v.y);
+  return ivec2_scalar{firstX, firstY};
+}
+
+// Three-component integer vector holding one plain int32_t per component.
+struct ivec3_scalar {
+ int32_t x;
+ int32_t y;
+ int32_t z;
+
+ // Default-constructs to zero; the single-argument form broadcasts.
+ ivec3_scalar() : ivec3_scalar(0) {}
+ IMPLICIT constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {}
+ constexpr ivec3_scalar(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {}
+
+ // Returns a reference to the component named by `c`; X, Y and Z are
+ // handled, any other selector hits UNREACHABLE.
+ int32_t& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ case Z:
+ return z;
+ default:
+ UNREACHABLE;
+ }
+ }
+ // Swizzles: one selector yields a reference, two yield an ivec2_scalar.
+ int32_t& sel(XYZW c1) { return select(c1); }
+ ivec2_scalar sel(XYZW c1, XYZW c2) {
+ return ivec2_scalar{select(c1), select(c2)};
+ }
+};
+
+// Three-component integer vector whose components are I32 vectors.
+struct ivec3 {
+ ivec3() : ivec3(0) {}
+ // Broadcasts `a` to all three components.
+ IMPLICIT ivec3(I32 a) : x(a), y(a), z(a) {}
+ ivec3(I32 x, I32 y, I32 z) : x(x), y(y), z(z) {}
+ // Extends a two-component vector with a third component.
+ ivec3(ivec2 a, I32 b) : x(a.x), y(a.y), z(b) {}
+ // Float inputs are converted component-wise with cast().
+ ivec3(vec2 a, Float b) : x(cast(a.x)), y(cast(a.y)), z(cast(b)) {}
+ I32 x;
+ I32 y;
+ I32 z;
+
+ // Component-wise addition.
+ friend ivec3 operator+(ivec3 a, ivec3 b) {
+ return ivec3(a.x + b.x, a.y + b.y, a.z + b.z);
+ }
+};
+
+// Build a vec2_scalar from the x and y of an ivec3_scalar; z is dropped.
+vec2_scalar make_vec2(ivec3_scalar s) {
+  return vec2_scalar(static_cast<float>(s.x), static_cast<float>(s.y));
+}
+
+// Scalar constructors: build an ivec3_scalar from plain integers.
+ivec3_scalar make_ivec3(int32_t n) { return ivec3_scalar(n, n, n); }
+
+ivec3_scalar make_ivec3(const ivec2_scalar& v, int32_t z) {
+  return ivec3_scalar(v.x, v.y, z);
+}
+
+ivec3_scalar make_ivec3(int32_t x, int32_t y, int32_t z) {
+  return ivec3_scalar(x, y, z);
+}
+
+// Generic constructors: forward any other argument(s) to ivec3.
+template <typename N>
+ivec3 make_ivec3(const N& n) {
+  ivec3 built(n);
+  return built;
+}
+
+template <typename X, typename Y>
+ivec3 make_ivec3(const X& x, const Y& y) {
+  ivec3 built(x, y);
+  return built;
+}
+
+template <typename X, typename Y, typename Z>
+ivec3 make_ivec3(const X& x, const Y& y, const Z& z) {
+  ivec3 built(x, y, z);
+  return built;
+}
+
+// Four-component integer vector holding one plain int32_t per component.
+struct ivec4_scalar {
+ typedef int32_t element_type;
+
+ int32_t x;
+ int32_t y;
+ int32_t z;
+ int32_t w;
+
+ // Default-constructs to zero; the single-argument form broadcasts.
+ ivec4_scalar() : ivec4_scalar(0) {}
+ IMPLICIT constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {}
+ constexpr ivec4_scalar(int32_t x, int32_t y, int32_t z, int32_t w)
+ : x(x), y(y), z(z), w(w) {}
+
+ // Returns a reference to the component named by `c`.
+ int32_t& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ case Z:
+ return z;
+ case W:
+ return w;
+ default:
+ UNREACHABLE;
+ }
+ }
+ // Swizzles: one selector yields a reference, two yield an ivec2_scalar.
+ int32_t& sel(XYZW c1) { return select(c1); }
+ ivec2_scalar sel(XYZW c1, XYZW c2) {
+ return ivec2_scalar{select(c1), select(c2)};
+ }
+
+ // Masks every component with the scalar `a`.
+ friend ivec4_scalar operator&(int32_t a, ivec4_scalar b) {
+ return ivec4_scalar{a & b.x, a & b.y, a & b.z, a & b.w};
+ }
+};
+
+// Four-component integer vector whose components are I32 vectors.
+struct ivec4 {
+ typedef int32_t element_type;
+
+ // Default-constructs to zero; the single-I32 form broadcasts.
+ ivec4() : ivec4(I32(0)) {}
+ IMPLICIT ivec4(I32 a) : x(a), y(a), z(a), w(a) {}
+ ivec4(I32 x, I32 y, I32 z, I32 w) : x(x), y(y), z(z), w(w) {}
+ // Extends a two-component vector with z and w.
+ ivec4(ivec2 a, I32 b, I32 c) : x(a.x), y(a.y), z(b), w(c) {}
+ // Broadcasts one scalar vector across all lanes.
+ IMPLICIT constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
+ // Packs four scalar vectors, one per lane.
+ constexpr ivec4(ivec4_scalar s0, ivec4_scalar s1, ivec4_scalar s2,
+ ivec4_scalar s3)
+ : x(I32{s0.x, s1.x, s2.x, s3.x}),
+ y(I32{s0.y, s1.y, s2.y, s3.y}),
+ z(I32{s0.z, s1.z, s2.z, s3.z}),
+ w(I32{s0.w, s1.w, s2.w, s3.w}) {}
+
+ // Returns a reference to the component named by `c`.
+ I32& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ case Z:
+ return z;
+ case W:
+ return w;
+ default:
+ UNREACHABLE;
+ }
+ }
+ // Note: unlike ivec2::sel, the single-selector form returns a copy, not a
+ // reference.
+ I32 sel(XYZW c1) { return select(c1); }
+
+ ivec2 sel(XYZW c1, XYZW c2) { return ivec2(select(c1), select(c2)); }
+
+ ivec3 sel(XYZW c1, XYZW c2, XYZW c3) {
+ return ivec3(select(c1), select(c2), select(c3));
+ }
+
+ // Masks every component with `a`.
+ friend ivec4 operator&(I32 a, ivec4 b) {
+ return ivec4(a & b.x, a & b.y, a & b.z, a & b.w);
+ }
+
+ I32 x;
+ I32 y;
+ I32 z;
+ I32 w;
+};
+
+// Collapse an ivec4 to an ivec4_scalar using the first lane of each
+// component.
+ivec4_scalar force_scalar(const ivec4& v) {
+  const int32_t firstX = force_scalar(v.x);
+  const int32_t firstY = force_scalar(v.y);
+  const int32_t firstZ = force_scalar(v.z);
+  const int32_t firstW = force_scalar(v.w);
+  return ivec4_scalar{firstX, firstY, firstZ, firstW};
+}
+
+// Scalar constructors: build an ivec4_scalar from plain integers.
+ivec4_scalar make_ivec4(int32_t n) { return ivec4_scalar(n, n, n, n); }
+
+ivec4_scalar make_ivec4(const ivec2_scalar& xy, int32_t z, int32_t w) {
+  return ivec4_scalar(xy.x, xy.y, z, w);
+}
+
+ivec4_scalar make_ivec4(int32_t x, int32_t y, int32_t z, int32_t w) {
+  return ivec4_scalar(x, y, z, w);
+}
+
+// Generic constructors: forward any other argument(s) to ivec4.
+template <typename N>
+ivec4 make_ivec4(const N& n) {
+  ivec4 built(n);
+  return built;
+}
+
+template <typename X, typename Y, typename Z>
+ivec4 make_ivec4(const X& x, const Y& y, const Z& z) {
+  ivec4 built(x, y, z);
+  return built;
+}
+
+template <typename X, typename Y, typename Z, typename W>
+ivec4 make_ivec4(const X& x, const Y& y, const Z& z, const W& w) {
+  ivec4 built(x, y, z, w);
+  return built;
+}
+
+// Component-wise select for integer vectors; a vector condition applies per
+// lane, a scalar condition picks one whole operand.
+SI ivec2 if_then_else(I32 c, ivec2 t, ivec2 e) {
+  const I32 pickedX = if_then_else(c, t.x, e.x);
+  const I32 pickedY = if_then_else(c, t.y, e.y);
+  return ivec2(pickedX, pickedY);
+}
+
+SI ivec2 if_then_else(int32_t c, ivec2 t, ivec2 e) {
+  if (c) return t;
+  return e;
+}
+
+SI ivec4 if_then_else(I32 c, ivec4 t, ivec4 e) {
+  const I32 pickedX = if_then_else(c, t.x, e.x);
+  const I32 pickedY = if_then_else(c, t.y, e.y);
+  const I32 pickedZ = if_then_else(c, t.z, e.z);
+  const I32 pickedW = if_then_else(c, t.w, e.w);
+  return ivec4(pickedX, pickedY, pickedZ, pickedW);
+}
+
+SI ivec4 if_then_else(int32_t c, ivec4 t, ivec4 e) {
+  if (c) return t;
+  return e;
+}
+
+// Mask each scalar component of `b` with the vector `a`, giving a full
+// ivec4.
+ivec4 operator&(I32 a, ivec4_scalar b) {
+  const I32 maskedX = a & b.x;
+  const I32 maskedY = a & b.y;
+  const I32 maskedZ = a & b.z;
+  const I32 maskedW = a & b.w;
+  return ivec4(maskedX, maskedY, maskedZ, maskedW);
+}
+
+// Three-component boolean vector holding one plain bool per component.
+struct bvec3_scalar {
+ bool x;
+ bool y;
+ bool z;
+
+ // Default-constructs to all false; the single-argument form broadcasts.
+ bvec3_scalar() : bvec3_scalar(false) {}
+ IMPLICIT constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {}
+ constexpr bvec3_scalar(bool x, bool y, bool z) : x(x), y(y), z(z) {}
+};
+
+// Three-component boolean vector whose components are Bool vectors.
+struct bvec3 {
+ bvec3() : bvec3(0) {}
+ // Broadcasts `a` to all three components.
+ IMPLICIT bvec3(Bool a) : x(a), y(a), z(a) {}
+ bvec3(Bool x, Bool y, Bool z) : x(x), y(y), z(z) {}
+ // Returns a reference to the component named by `c`; X, Y and Z are
+ // handled, any other selector hits UNREACHABLE.
+ Bool& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ case Z:
+ return z;
+ default:
+ UNREACHABLE;
+ }
+ }
+ // Single-component swizzle; returns a copy of the selected component.
+ Bool sel(XYZW c1) { return select(c1); }
+
+ Bool x;
+ Bool y;
+ Bool z;
+};
+
+// Scalar constructor: broadcast one bool to all three components.
+bvec3_scalar make_bvec3(bool n) { return bvec3_scalar{n, n, n}; }
+
+// Four-component boolean vector holding one plain bool per component.
+struct bvec4_scalar {
+ bool x;
+ bool y;
+ bool z;
+ bool w;
+
+ // Default-constructs to all false; the single-argument form broadcasts.
+ bvec4_scalar() : bvec4_scalar(false) {}
+ IMPLICIT constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {}
+ constexpr bvec4_scalar(bool x, bool y, bool z, bool w)
+ : x(x), y(y), z(z), w(w) {}
+};
+
+// Four-component boolean vector whose components are Bool vectors.
+struct bvec4 {
+  bvec4() : bvec4(0) {}
+  // Broadcasts `a` to all four components.
+  IMPLICIT bvec4(Bool a) : x(a), y(a), z(a), w(a) {}
+  bvec4(Bool x, Bool y, Bool z, Bool w) : x(x), y(y), z(z), w(w) {}
+  // Concatenates two bvec2s: (x.x, x.y, y.x, y.y).
+  bvec4(bvec2 x, bvec2 y) : x(x.x), y(x.y), z(y.x), w(y.y) {}
+  // Returns a reference to the component named by `c`. The default case
+  // mirrors the other vector types, so an out-of-range selector value can
+  // never fall off the end of this non-void function.
+  Bool& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      case W:
+        return w;
+      default:
+        UNREACHABLE;
+    }
+  }
+  // Single-component swizzle; returns a copy of the selected component.
+  Bool sel(XYZW c1) { return select(c1); }
+
+  Bool x;
+  Bool y;
+  Bool z;
+  Bool w;
+};
+
+// Scalar constructors: build a bvec4_scalar from plain bools.
+bvec4_scalar make_bvec4(bool n) { return bvec4_scalar(n, n, n, n); }
+
+bvec4_scalar make_bvec4(bool x, bool y, bool z, bool w) {
+  return bvec4_scalar(x, y, z, w);
+}
+
+// Generic constructors: forward any other argument(s) to bvec4.
+template <typename N>
+bvec4 make_bvec4(const N& n) {
+  bvec4 built(n);
+  return built;
+}
+
+template <typename X, typename Y>
+bvec4 make_bvec4(const X& x, const Y& y) {
+  bvec4 built(x, y);
+  return built;
+}
+
+template <typename X, typename Y, typename Z, typename W>
+bvec4 make_bvec4(const X& x, const Y& y, const Z& z, const W& w) {
+  bvec4 built(x, y, z, w);
+  return built;
+}
+
+// Writable view over two existing Float vectors, returned by vec3::lsel() so
+// that a two-component swizzle can be assigned to. Holds references, not
+// copies.
+struct vec2_ref {
+ vec2_ref(Float& x, Float& y) : x(x), y(y) {}
+ Float& x;
+ Float& y;
+
+ // Returns the referenced component named by `c`; only X and Y are handled.
+ Float& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ default:
+ UNREACHABLE;
+ }
+ }
+ Float& sel(XYZW c1) { return select(c1); }
+
+ // All operators below write through to the referenced components.
+ vec2_ref& operator=(const vec2& a) {
+ x = a.x;
+ y = a.y;
+ return *this;
+ }
+
+ vec2_ref& operator/=(Float a) {
+ x /= a;
+ y /= a;
+ return *this;
+ }
+
+ vec2_ref& operator/=(vec2 a) {
+ x /= a.x;
+ y /= a.y;
+ return *this;
+ }
+
+ vec2_ref& operator+=(vec2 a) {
+ x += a.x;
+ y += a.y;
+ return *this;
+ }
+ vec2_ref& operator-=(vec2 a) {
+ x -= a.x;
+ y -= a.y;
+ return *this;
+ }
+ vec2_ref& operator*=(vec2 a) {
+ x *= a.x;
+ y *= a.y;
+ return *this;
+ }
+};
+
+// Three-component float vector holding one plain float per component.
+struct vec3_scalar {
+  typedef struct vec3 vector_type;
+  typedef float element_type;
+
+  float x;
+  float y;
+  float z;
+
+  // Default-constructs to zero; the single-argument form broadcasts.
+  constexpr vec3_scalar() : vec3_scalar(0.0f) {}
+  IMPLICIT constexpr vec3_scalar(float a) : x(a), y(a), z(a) {}
+  constexpr vec3_scalar(float x, float y, float z) : x(x), y(y), z(z) {}
+
+  // Returns a reference to the component named by `c`; X, Y and Z are
+  // handled, any other selector hits UNREACHABLE.
+  float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      default:
+        UNREACHABLE;
+    }
+  }
+  // Swizzles: sel() yields copies (or a reference for one selector); lsel()
+  // yields a writable view onto two components of this vector.
+  float& sel(XYZW c1) { return select(c1); }
+  vec2_scalar sel(XYZW c1, XYZW c2) {
+    return vec2_scalar(select(c1), select(c2));
+  }
+  vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_scalar(select(c1), select(c2), select(c3));
+  }
+  vec2_scalar_ref lsel(XYZW c1, XYZW c2) {
+    return vec2_scalar_ref(select(c1), select(c2));
+  }
+
+  // Component-wise arithmetic.
+  friend vec3_scalar operator*(vec3_scalar a, vec3_scalar b) {
+    return vec3_scalar{a.x * b.x, a.y * b.y, a.z * b.z};
+  }
+  friend vec3_scalar operator*(vec3_scalar a, float b) {
+    return vec3_scalar{a.x * b, a.y * b, a.z * b};
+  }
+
+  friend vec3_scalar operator-(vec3_scalar a, vec3_scalar b) {
+    return vec3_scalar{a.x - b.x, a.y - b.y, a.z - b.z};
+  }
+  friend vec3_scalar operator+(vec3_scalar a, vec3_scalar b) {
+    return vec3_scalar{a.x + b.x, a.y + b.y, a.z + b.z};
+  }
+
+  friend vec3_scalar operator/(vec3_scalar a, float b) {
+    return vec3_scalar{a.x / b, a.y / b, a.z / b};
+  }
+
+  // Adds in place and returns a reference to this vector rather than a copy,
+  // as is conventional for compound assignment.
+  vec3_scalar& operator+=(vec3_scalar a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    return *this;
+  }
+
+  friend bool operator==(const vec3_scalar& l, const vec3_scalar& r) {
+    return l.x == r.x && l.y == r.y && l.z == r.z;
+  }
+};
+
+// Writable view over three existing floats. Holds references, not copies, so
+// assignment writes through to the original storage.
+struct vec3_scalar_ref {
+ vec3_scalar_ref(float& x, float& y, float& z) : x(x), y(y), z(z) {}
+ float& x;
+ float& y;
+ float& z;
+
+ // Returns the referenced component named by `c`; X, Y and Z are handled,
+ // any other selector hits UNREACHABLE.
+ float& select(XYZW c) {
+ switch (c) {
+ case X:
+ return x;
+ case Y:
+ return y;
+ case Z:
+ return z;
+ default:
+ UNREACHABLE;
+ }
+ }
+ float& sel(XYZW c1) { return select(c1); }
+
+ // Writes through to the referenced floats.
+ vec3_scalar_ref& operator=(const vec3_scalar& a) {
+ x = a.x;
+ y = a.y;
+ z = a.z;
+ return *this;
+ }
+
+ // Snapshot of the referenced values as an independent vec3_scalar.
+ operator vec3_scalar() const { return vec3_scalar{x, y, z}; }
+};
+
+// Three-component float vector whose components are Float vectors.
+struct vec3 {
+  typedef struct vec3 vector_type;
+  typedef float element_type;
+
+  // Default-constructs to zero; the single-Float form broadcasts.
+  constexpr vec3() : vec3(Float(0.0f)) {}
+  IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {}
+  constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {}
+  // Extends a two-component vector with z.
+  vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {}
+  // Broadcasts one scalar vector across all lanes.
+  IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {}
+  // Packs four scalar vectors, one per lane.
+  constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3)
+      : x(Float{s0.x, s1.x, s2.x, s3.x}),
+        y(Float{s0.y, s1.y, s2.y, s3.y}),
+        z(Float{s0.z, s1.z, s2.z, s3.z}) {}
+  Float x;
+  Float y;
+  Float z;
+
+  // Returns a reference to the component named by `c`; X, Y and Z are
+  // handled, any other selector hits UNREACHABLE.
+  Float& select(XYZW c) {
+    switch (c) {
+      case X:
+        return x;
+      case Y:
+        return y;
+      case Z:
+        return z;
+      default:
+        UNREACHABLE;
+    }
+  }
+  Float& sel(XYZW c1) { return select(c1); }
+
+  vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); }
+
+  vec3 sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3(select(c1), select(c2), select(c3));
+  }
+
+  // Writable two-component view onto this vector's storage.
+  vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); }
+
+  // Component-wise arithmetic.
+  friend vec3 operator*(vec3 a, Float b) {
+    return vec3(a.x * b, a.y * b, a.z * b);
+  }
+  friend vec3 operator*(vec3 a, vec3 b) {
+    return vec3(a.x * b.x, a.y * b.y, a.z * b.z);
+  }
+  friend vec3 operator*(Float a, vec3 b) {
+    return vec3(a * b.x, a * b.y, a * b.z);
+  }
+
+  friend vec3 operator/(vec3 a, Float b) {
+    return vec3(a.x / b, a.y / b, a.z / b);
+  }
+
+  // Returns a per-lane mask.
+  friend I32 operator==(const vec3& l, const vec3& r) {
+    return l.x == r.x && l.y == r.y && l.z == r.z;
+  }
+
+  friend vec3 operator-(vec3 a, Float b) {
+    return vec3(a.x - b, a.y - b, a.z - b);
+  }
+  friend vec3 operator-(vec3 a, vec3 b) {
+    return vec3(a.x - b.x, a.y - b.y, a.z - b.z);
+  }
+  friend vec3 operator+(vec3 a, Float b) {
+    return vec3(a.x + b, a.y + b, a.z + b);
+  }
+  friend vec3 operator+(vec3 a, vec3 b) {
+    return vec3(a.x + b.x, a.y + b.y, a.z + b.z);
+  }
+
+  // Both += overloads add in place and return a reference to this vector.
+  // The vec3_scalar overload used to return a copy, unlike its sibling.
+  vec3& operator+=(vec3_scalar a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    return *this;
+  }
+  vec3& operator+=(vec3 a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    return *this;
+  }
+};
+
+// Reduces a vec3 to scalars by applying force_scalar to each component.
+vec3_scalar force_scalar(const vec3& v) {
+  const auto sx = force_scalar(v.x);
+  const auto sy = force_scalar(v.y);
+  const auto sz = force_scalar(v.z);
+  return vec3_scalar{sx, sy, sz};
+}
+
+// make_vec3 overloads. The non-template forms build a vec3_scalar from scalar
+// inputs; the templates forward everything else to the vec3 constructors.
+
+// Same value in every component.
+vec3_scalar make_vec3(float n) {
+  vec3_scalar splat{n, n, n};
+  return splat;
+}
+
+// xy from `v`, explicit z.
+vec3_scalar make_vec3(const vec2_scalar& v, float z) {
+  vec3_scalar result{v.x, v.y, z};
+  return result;
+}
+
+vec3_scalar make_vec3(float x, float y, float z) {
+  vec3_scalar result{x, y, z};
+  return result;
+}
+
+// Integer x/y are converted to float first.
+vec3_scalar make_vec3(int32_t x, int32_t y, float z) {
+  vec3_scalar result{static_cast<float>(x), static_cast<float>(y), z};
+  return result;
+}
+
+template <typename N>
+vec3 make_vec3(const N& n) {
+  vec3 result(n);
+  return result;
+}
+
+template <typename X, typename Y>
+vec3 make_vec3(const X& x, const Y& y) {
+  vec3 result(x, y);
+  return result;
+}
+
+template <typename X, typename Y, typename Z>
+vec3 make_vec3(const X& x, const Y& y, const Z& z) {
+  vec3 result(x, y, z);
+  return result;
+}
+
+// vec3 select with one mask shared by all three components.
+SI vec3 if_then_else(I32 c, vec3 t, vec3 e) {
+  const auto rx = if_then_else(c, t.x, e.x);
+  const auto ry = if_then_else(c, t.y, e.y);
+  const auto rz = if_then_else(c, t.z, e.z);
+  return vec3(rx, ry, rz);
+}
+
+// Scalar condition: picks one whole vector.
+SI vec3 if_then_else(int32_t c, vec3 t, vec3 e) {
+  if (c) {
+    return t;
+  }
+  return e;
+}
+
+// vec3 select with a separate mask per component.
+SI vec3 if_then_else(ivec3 c, vec3 t, vec3 e) {
+  const auto rx = if_then_else(c.x, t.x, e.x);
+  const auto ry = if_then_else(c.y, t.y, e.y);
+  const auto rz = if_then_else(c.z, t.z, e.z);
+  return vec3(rx, ry, rz);
+}
+
+// Component-wise step(): forwards each component pair to the Float overload.
+vec3 step(vec3 edge, vec3 x) {
+  const auto sx = step(edge.x, x.x);
+  const auto sy = step(edge.y, x.y);
+  const auto sz = step(edge.z, x.z);
+  return vec3(sx, sy, sz);
+}
+
+// Component-wise minimum of two vec3 values.
+SI vec3 min(vec3 a, vec3 b) {
+  const auto lo_x = min(a.x, b.x);
+  const auto lo_y = min(a.y, b.y);
+  const auto lo_z = min(a.z, b.z);
+  return vec3(lo_x, lo_y, lo_z);
+}
+// Component-wise maximum of two vec3 values.
+SI vec3 max(vec3 a, vec3 b) {
+  const auto hi_x = max(a.x, b.x);
+  const auto hi_y = max(a.y, b.y);
+  const auto hi_z = max(a.z, b.z);
+  return vec3(hi_x, hi_y, hi_z);
+}
+
+// Scalar counterpart of the overload above.
+SI vec3_scalar max(vec3_scalar a, vec3_scalar b) {
+  const auto hi_x = max(a.x, b.x);
+  const auto hi_y = max(a.y, b.y);
+  const auto hi_z = max(a.z, b.z);
+  return vec3_scalar{hi_x, hi_y, hi_z};
+}
+
+// Component-wise pow(): raises each component of x to the matching
+// component of y.
+vec3 pow(vec3 x, vec3 y) {
+  const auto px = pow(x.x, y.x);
+  const auto py = pow(x.y, y.y);
+  const auto pz = pow(x.z, y.z);
+  return vec3(px, py, pz);
+}
+
+// Assignable view over three independent Float lvalues, as returned by
+// vec4::lsel(c1, c2, c3).
+struct vec3_ref {
+  Float& x;
+  Float& y;
+  Float& z;
+
+  vec3_ref(Float& rx, Float& ry, Float& rz) : x(rx), y(ry), z(rz) {}
+
+  // Writes the components of `a` through to the referenced Floats.
+  vec3_ref& operator=(const vec3& a) {
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    return *this;
+  }
+
+  // Divides every referenced component by `a` in place.
+  vec3_ref& operator/=(Float a) {
+    x /= a;
+    y /= a;
+    z /= a;
+    return *this;
+  }
+
+  // Multiplies every referenced component by `a` in place.
+  vec3_ref& operator*=(Float a) {
+    x *= a;
+    y *= a;
+    z *= a;
+    return *this;
+  }
+};
+
+// Four-component float vector holding plain scalars (a single lane).
+struct vec4_scalar {
+  typedef struct vec4 vector_type;
+  typedef float element_type;
+
+  float x;
+  float y;
+  float z;
+  float w;
+
+  constexpr vec4_scalar() : vec4_scalar(0.0f) {}
+  // Same value in every component.
+  IMPLICIT constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {}
+  constexpr vec4_scalar(float x, float y, float z, float w)
+      : x(x), y(y), z(z), w(w) {}
+  vec4_scalar(vec3_scalar xyz, float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+
+  // Maps a swizzle component onto the matching member.
+  ALWAYS_INLINE float& select(XYZW c) {
+    if (c == X) return x;
+    if (c == Y) return y;
+    if (c == Z) return z;
+    if (c == W) return w;
+    UNREACHABLE;
+  }
+
+  // Read-only swizzles.
+  float& sel(XYZW c1) { return select(c1); }
+  vec2_scalar sel(XYZW c1, XYZW c2) {
+    return vec2_scalar{select(c1), select(c2)};
+  }
+  vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_scalar{select(c1), select(c2), select(c3)};
+  }
+
+  // Writable swizzles: the returned object refers back into this vector.
+  vec2_scalar_ref lsel(XYZW c1, XYZW c2) {
+    return vec2_scalar_ref(select(c1), select(c2));
+  }
+  vec3_scalar_ref lsel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_scalar_ref(select(c1), select(c2), select(c3));
+  }
+
+  friend vec4_scalar operator*(vec4_scalar a, vec4_scalar b) {
+    return vec4_scalar(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+  }
+  friend vec4_scalar operator*(vec4_scalar a, float b) {
+    a *= b;
+    return a;
+  }
+  vec4_scalar& operator*=(float a) {
+    x *= a;
+    y *= a;
+    z *= a;
+    w *= a;
+    return *this;
+  }
+
+  friend vec4_scalar operator-(vec4_scalar a, vec4_scalar b) {
+    return vec4_scalar(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+  }
+  friend vec4_scalar operator+(vec4_scalar a, vec4_scalar b) {
+    a += b;
+    return a;
+  }
+
+  friend vec4_scalar operator/(vec4_scalar a, vec4_scalar b) {
+    a /= b;
+    return a;
+  }
+
+  vec4_scalar& operator+=(vec4_scalar a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    w += a.w;
+    return *this;
+  }
+
+  vec4_scalar& operator/=(vec4_scalar a) {
+    x /= a.x;
+    y /= a.y;
+    z /= a.z;
+    w /= a.w;
+    return *this;
+  }
+
+  friend bool operator==(const vec4_scalar& l, const vec4_scalar& r) {
+    return l.x == r.x && l.y == r.y && l.z == r.z && l.w == r.w;
+  }
+
+  // Exact negation of operator== (also for NaN components).
+  friend bool operator!=(const vec4_scalar& l, const vec4_scalar& r) {
+    return !(l == r);
+  }
+};
+
+// Four-component swizzle of a vec2_scalar. Defined out of line here, after
+// vec4_scalar has been fully declared.
+vec4_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  const float first = select(c1);
+  const float second = select(c2);
+  const float third = select(c3);
+  const float fourth = select(c4);
+  return vec4_scalar{first, second, third, fourth};
+}
+
+// Four-component float vector in SIMD form: x, y, z and w are each a Float
+// carrying one value per lane.
+struct vec4 {
+  typedef struct vec4 vector_type;
+  typedef float element_type;
+
+  constexpr vec4() : vec4(Float(0.0f)) {}
+  // Uses the same Float for all four components.
+  IMPLICIT constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {}
+  vec4(Float x, Float y, Float z, Float w) : x(x), y(y), z(z), w(w) {}
+  vec4(vec3 xyz, Float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+  vec4(vec2 xy, vec2 zw) : x(xy.x), y(xy.y), z(zw.x), w(zw.y) {}
+  vec4(vec2 xy, Float z, Float w) : x(xy.x), y(xy.y), z(z), w(w) {}
+  vec4(Float x, Float y, vec2 zw) : x(x), y(y), z(zw.x), w(zw.y) {}
+  // Converts each scalar component to a Float.
+  IMPLICIT constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
+  // Packs four scalar vectors into the four lanes, s0 in lane 0 through s3
+  // in lane 3.
+  constexpr vec4(vec4_scalar s0, vec4_scalar s1, vec4_scalar s2, vec4_scalar s3)
+      : x(Float{s0.x, s1.x, s2.x, s3.x}),
+        y(Float{s0.y, s1.y, s2.y, s3.y}),
+        z(Float{s0.z, s1.z, s2.z, s3.z}),
+        w(Float{s0.w, s1.w, s2.w, s3.w}) {}
+
+  // Maps a swizzle component onto the matching member.
+  Float& select(XYZW c) {
+    if (c == X) return x;
+    if (c == Y) return y;
+    if (c == Z) return z;
+    if (c == W) return w;
+    UNREACHABLE;
+  }
+  Float& sel(XYZW c1) { return select(c1); }
+
+  // Read-only swizzles.
+  vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); }
+
+  vec3 sel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3(select(c1), select(c2), select(c3));
+  }
+
+  // Writable swizzles: the returned object refers back into this vector.
+  vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) {
+    return vec3_ref(select(c1), select(c2), select(c3));
+  }
+
+  vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); }
+
+  // Uniform index: every lane reads the same component.
+  Float& operator[](int index) {
+    if (index == 0) return x;
+    if (index == 1) return y;
+    if (index == 2) return z;
+    if (index == 3) return w;
+    UNREACHABLE;
+  }
+
+  // glsl supports non-const indexing of vecs.
+  // hlsl doesn't. The code it generates is probably not wonderful.
+  //
+  // Per-lane index: each lane picks its own component. An index outside
+  // 0..3 produces 0 for that lane.
+  Float operator[](I32 index) {
+    // Chooses among one lane's four component values.
+    auto pick = [](int32_t component, float from_x, float from_y, float from_z,
+                   float from_w) -> float {
+      switch (component) {
+        case 0:
+          return from_x;
+        case 1:
+          return from_y;
+        case 2:
+          return from_z;
+        case 3:
+          return from_w;
+        default:
+          return 0;
+      }
+    };
+    float sel_x = pick(index.x, x.x, y.x, z.x, w.x);
+    float sel_y = pick(index.y, x.y, y.y, z.y, w.y);
+    float sel_z = pick(index.z, x.z, y.z, z.z, w.z);
+    float sel_w = pick(index.w, x.w, y.w, z.w, w.w);
+    Float ret = {sel_x, sel_y, sel_z, sel_w};
+    return ret;
+  }
+
+  friend vec4 operator/(vec4 a, Float b) {
+    return vec4(a.x / b, a.y / b, a.z / b, a.w / b);
+  }
+  friend vec4 operator/(vec4 a, vec4 b) {
+    a /= b;
+    return a;
+  }
+
+  friend vec4 operator*(vec4 a, Float b) {
+    a *= b;
+    return a;
+  }
+
+  friend vec4 operator*(Float b, vec4 a) {
+    a *= b;
+    return a;
+  }
+  friend vec4 operator*(vec4 a, vec4 b) {
+    return vec4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+  }
+
+  friend vec4 operator-(vec4 a, vec4 b) {
+    return vec4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+  }
+  friend vec4 operator+(vec4 a, vec4 b) {
+    a += b;
+    return a;
+  }
+  vec4& operator+=(vec4 a) {
+    x += a.x;
+    y += a.y;
+    z += a.z;
+    w += a.w;
+    return *this;
+  }
+  vec4& operator/=(vec4 a) {
+    x /= a.x;
+    y /= a.y;
+    z /= a.z;
+    w /= a.w;
+    return *this;
+  }
+  vec4& operator*=(Float a) {
+    x *= a;
+    y *= a;
+    z *= a;
+    w *= a;
+    return *this;
+  }
+
+  Float x;
+  Float y;
+  Float z;
+  Float w;
+};
+
+// Reduces a vec4 to scalars by applying force_scalar to each component.
+vec4_scalar force_scalar(const vec4& v) {
+  const auto sx = force_scalar(v.x);
+  const auto sy = force_scalar(v.y);
+  const auto sz = force_scalar(v.z);
+  const auto sw = force_scalar(v.w);
+  return vec4_scalar{sx, sy, sz, sw};
+}
+
+// Scalar make_vec4 overloads: each assembles a vec4_scalar from a different
+// mix of floats and smaller scalar vectors.
+
+// Same value in every component.
+vec4_scalar make_vec4(float n) { return vec4_scalar(n, n, n, n); }
+
+// xy from `v`, explicit z and w.
+vec4_scalar make_vec4(const vec2_scalar& v, float z, float w) {
+  return vec4_scalar(v.x, v.y, z, w);
+}
+
+// xy from `a`, zw from `b`.
+vec4_scalar make_vec4(const vec2_scalar& a, const vec2_scalar& b) {
+  return vec4_scalar(a.x, a.y, b.x, b.y);
+}
+
+// xyz from `v`, explicit w.
+vec4_scalar make_vec4(const vec3_scalar& v, float w) {
+  return vec4_scalar(v.x, v.y, v.z, w);
+}
+
+vec4_scalar make_vec4(float x, float y, float z, float w) {
+  return vec4_scalar(x, y, z, w);
+}
+
+// Explicit x and y, zw from `v`.
+vec4_scalar make_vec4(float x, float y, const vec2_scalar& v) {
+  return vec4_scalar(x, y, v.x, v.y);
+}
+
+// Converts each float component to int32_t with a C++ cast (truncation
+// toward zero).
+ivec4_scalar make_ivec4(const vec4_scalar& v) {
+  const int32_t ix = static_cast<int32_t>(v.x);
+  const int32_t iy = static_cast<int32_t>(v.y);
+  const int32_t iz = static_cast<int32_t>(v.z);
+  const int32_t iw = static_cast<int32_t>(v.w);
+  return ivec4_scalar{ix, iy, iz, iw};
+}
+
+// Generic make_vec4 overloads: forward the arguments unchanged to whichever
+// vec4 constructor accepts them.
+template <typename N>
+vec4 make_vec4(const N& n) {
+  vec4 result(n);
+  return result;
+}
+
+template <typename X, typename Y>
+vec4 make_vec4(const X& x, const Y& y) {
+  vec4 result(x, y);
+  return result;
+}
+
+template <typename X, typename Y, typename Z>
+vec4 make_vec4(const X& x, const Y& y, const Z& z) {
+  vec4 result(x, y, z);
+  return result;
+}
+
+template <typename X, typename Y, typename Z, typename W>
+vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) {
+  vec4 result(x, y, z, w);
+  return result;
+}
+
+// Applies the Float roundfast(value, scale) to every component of `v`.
+SI ivec4 roundfast(vec4 v, Float scale) {
+  const auto rx = roundfast(v.x, scale);
+  const auto ry = roundfast(v.y, scale);
+  const auto rz = roundfast(v.z, scale);
+  const auto rw = roundfast(v.w, scale);
+  return ivec4(rx, ry, rz, rw);
+}
+
+// Scalar vector times per-lane factor: every component of `a` is multiplied
+// by the Float `b`, yielding a vec4.
+vec4 operator*(vec4_scalar a, Float b) {
+  const Float px = a.x * b;
+  const Float py = a.y * b;
+  const Float pz = a.z * b;
+  const Float pw = a.w * b;
+  return vec4(px, py, pz, pw);
+}
+
+// vec4 select with one mask shared by all four components.
+SI vec4 if_then_else(I32 c, vec4 t, vec4 e) {
+  const auto rx = if_then_else(c, t.x, e.x);
+  const auto ry = if_then_else(c, t.y, e.y);
+  const auto rz = if_then_else(c, t.z, e.z);
+  const auto rw = if_then_else(c, t.w, e.w);
+  return vec4(rx, ry, rz, rw);
+}
+
+// Scalar condition: picks one whole vector.
+SI vec4 if_then_else(int32_t c, vec4 t, vec4 e) {
+  if (c) {
+    return t;
+  }
+  return e;
+}
+
+// clamp overloads. The vector forms clamp each component independently by
+// forwarding to the clamp() of the component type.
+
+SI vec2 clamp(vec2 a, vec2 minVal, vec2 maxVal) {
+  const auto cx = clamp(a.x, minVal.x, maxVal.x);
+  const auto cy = clamp(a.y, minVal.y, maxVal.y);
+  return vec2(cx, cy);
+}
+
+SI vec2_scalar clamp(vec2_scalar a, vec2_scalar minVal, vec2_scalar maxVal) {
+  const auto cx = clamp(a.x, minVal.x, maxVal.x);
+  const auto cy = clamp(a.y, minVal.y, maxVal.y);
+  return vec2_scalar{cx, cy};
+}
+
+// Integer lanes: raise to the lower bound first, then cap at the upper one.
+SI I32 clamp(I32 a, I32 minVal, I32 maxVal) {
+  I32 floored = if_then_else(a < minVal, minVal, a);
+  return if_then_else(floored > maxVal, maxVal, floored);
+}
+
+SI vec3 clamp(vec3 a, vec3 minVal, vec3 maxVal) {
+  const auto cx = clamp(a.x, minVal.x, maxVal.x);
+  const auto cy = clamp(a.y, minVal.y, maxVal.y);
+  const auto cz = clamp(a.z, minVal.z, maxVal.z);
+  return vec3(cx, cy, cz);
+}
+
+SI vec4 clamp(vec4 a, vec4 minVal, vec4 maxVal) {
+  const auto cx = clamp(a.x, minVal.x, maxVal.x);
+  const auto cy = clamp(a.y, minVal.y, maxVal.y);
+  const auto cz = clamp(a.z, minVal.z, maxVal.z);
+  const auto cw = clamp(a.w, minVal.w, maxVal.w);
+  return vec4(cx, cy, cz, cw);
+}
+
+SI vec4_scalar clamp(vec4_scalar a, vec4_scalar minVal, vec4_scalar maxVal) {
+  const auto cx = clamp(a.x, minVal.x, maxVal.x);
+  const auto cy = clamp(a.y, minVal.y, maxVal.y);
+  const auto cz = clamp(a.z, minVal.z, maxVal.z);
+  const auto cw = clamp(a.w, minVal.w, maxVal.w);
+  return vec4_scalar{cx, cy, cz, cw};
+}
+
+// Generic fallbacks: defer to the operand type's own <= / < operator and
+// return whatever type that operator yields.
+template <typename T>
+auto lessThanEqual(T x, T y) -> decltype(x <= y) {
+  auto outcome = x <= y;
+  return outcome;
+}
+
+template <typename T>
+auto lessThan(T x, T y) -> decltype(x < y) {
+  auto outcome = x < y;
+  return outcome;
+}
+
+// Vector overloads: compare component by component and gather the results
+// into the matching boolean vector type.
+SI bvec3 lessThanEqual(vec3 x, vec3 y) {
+  const auto cx = lessThanEqual(x.x, y.x);
+  const auto cy = lessThanEqual(x.y, y.y);
+  const auto cz = lessThanEqual(x.z, y.z);
+  return bvec3(cx, cy, cz);
+}
+
+SI bvec2 lessThanEqual(vec2 x, vec2 y) {
+  const auto cx = lessThanEqual(x.x, y.x);
+  const auto cy = lessThanEqual(x.y, y.y);
+  return bvec2(cx, cy);
+}
+
+SI bvec2_scalar lessThanEqual(vec2_scalar x, vec2_scalar y) {
+  const auto cx = lessThanEqual(x.x, y.x);
+  const auto cy = lessThanEqual(x.y, y.y);
+  return bvec2_scalar{cx, cy};
+}
+
+SI bvec4 lessThanEqual(vec4 x, vec4 y) {
+  const auto cx = lessThanEqual(x.x, y.x);
+  const auto cy = lessThanEqual(x.y, y.y);
+  const auto cz = lessThanEqual(x.z, y.z);
+  const auto cw = lessThanEqual(x.w, y.w);
+  return bvec4(cx, cy, cz, cw);
+}
+
+SI bvec4_scalar lessThanEqual(vec4_scalar x, vec4_scalar y) {
+  const auto cx = lessThanEqual(x.x, y.x);
+  const auto cy = lessThanEqual(x.y, y.y);
+  const auto cz = lessThanEqual(x.z, y.z);
+  const auto cw = lessThanEqual(x.w, y.w);
+  return bvec4_scalar{cx, cy, cz, cw};
+}
+
+SI bvec2 lessThan(vec2 x, vec2 y) {
+  const auto cx = lessThan(x.x, y.x);
+  const auto cy = lessThan(x.y, y.y);
+  return bvec2(cx, cy);
+}
+
+// Generic fallback: defers to the operand type's own > operator.
+template <typename T>
+auto greaterThan(T x, T y) -> decltype(x > y) {
+  auto outcome = x > y;
+  return outcome;
+}
+
+// Component-wise > for vec2.
+bvec2 greaterThan(vec2 x, vec2 y) {
+  const auto cx = greaterThan(x.x, y.x);
+  const auto cy = greaterThan(x.y, y.y);
+  return bvec2(cx, cy);
+}
+
+// Generic fallback: defers to the operand type's own >= operator.
+template <typename T>
+auto greaterThanEqual(T x, T y) -> decltype(x >= y) {
+  auto outcome = x >= y;
+  return outcome;
+}
+
+// Component-wise >= for vec4.
+bvec4 greaterThanEqual(vec4 x, vec4 y) {
+  const auto cx = greaterThanEqual(x.x, y.x);
+  const auto cy = greaterThanEqual(x.y, y.y);
+  const auto cz = greaterThanEqual(x.z, y.z);
+  const auto cw = greaterThanEqual(x.w, y.w);
+  return bvec4(cx, cy, cz, cw);
+}
+
+struct mat4_scalar;
+
+// 2x2 float matrix of plain scalars. data[i] is indexed as a column by the
+// multiplication operators below.
+struct mat2_scalar {
+  vec2_scalar data[2];
+
+  mat2_scalar() = default;
+  // Fills every component with `a`.
+  // NOTE(review): GLSL's mat2(float) builds a diagonal matrix instead;
+  // confirm callers expect the fill behaviour.
+  IMPLICIT constexpr mat2_scalar(float a) {
+    for (int col = 0; col < 2; ++col) {
+      data[col] = vec2_scalar(a);
+    }
+  }
+  constexpr mat2_scalar(vec2_scalar a, vec2_scalar b) {
+    data[0] = a;
+    data[1] = b;
+  }
+  // Defined out of line, after mat4_scalar is complete.
+  IMPLICIT mat2_scalar(const mat4_scalar& mat);
+
+  vec2_scalar& operator[](int index) { return data[index]; }
+  const vec2_scalar& operator[](int index) const { return data[index]; }
+
+  // Matrix * column vector.
+  friend vec2_scalar operator*(mat2_scalar m, vec2_scalar v) {
+    vec2_scalar product;
+    product.x = m[0].x * v.x + m[1].x * v.y;
+    product.y = m[0].y * v.x + m[1].y * v.y;
+    return product;
+  }
+
+  // Same product with a per-lane vector operand.
+  friend vec2 operator*(mat2_scalar m, vec2 v) {
+    vec2 product;
+    product.x = m[0].x * v.x + m[1].x * v.y;
+    product.y = m[0].y * v.x + m[1].y * v.y;
+    return product;
+  }
+
+  // Scales every element by `f`.
+  friend mat2_scalar operator*(mat2_scalar m, float f) {
+    mat2_scalar scaled = m;
+    for (int col = 0; col < 2; ++col) {
+      scaled[col].x *= f;
+      scaled[col].y *= f;
+    }
+    return scaled;
+  }
+};
+
+struct mat4;
+
+// 2x2 float matrix in SIMD form. data[i] is indexed as a column by the
+// multiplication operators below.
+struct mat2 {
+  vec2 data[2];
+
+  vec2& operator[](int index) { return data[index]; }
+  const vec2& operator[](int index) const { return data[index]; }
+  mat2() = default;
+
+  // Fills every component with `a`.
+  IMPLICIT mat2(Float a) {
+    for (int col = 0; col < 2; ++col) {
+      data[col] = vec2(a);
+    }
+  }
+
+  mat2(vec2 a, vec2 b) {
+    data[0] = a;
+    data[1] = b;
+  }
+  // Defined out of line, after mat4 is complete.
+  IMPLICIT mat2(const mat4& mat);
+  // Converts each column of the scalar matrix to its vec2 form.
+  IMPLICIT constexpr mat2(mat2_scalar s) {
+    data[0] = vec2(s.data[0]);
+    data[1] = vec2(s.data[1]);
+  }
+
+  // Matrix * column vector.
+  friend vec2 operator*(mat2 m, vec2 v) {
+    vec2 product;
+    product.x = m[0].x * v.x + m[1].x * v.y;
+    product.y = m[0].y * v.x + m[1].y * v.y;
+    return product;
+  }
+  // Scales every element by `f`.
+  friend mat2 operator*(mat2 m, Float f) {
+    mat2 scaled = m;
+    for (int col = 0; col < 2; ++col) {
+      scaled[col].x *= f;
+      scaled[col].y *= f;
+    }
+    return scaled;
+  }
+};
+
+// make_mat2 overloads. The non-template forms produce a mat2_scalar; the
+// templates forward to the mat2 constructors.
+
+// Every element set to `n`.
+// NOTE(review): GLSL's mat2(float) is diagonal; confirm the fill behaviour
+// is what callers expect.
+mat2_scalar make_mat2(float n) {
+  return mat2_scalar(vec2_scalar(n, n), vec2_scalar(n, n));
+}
+
+// Identity conversion.
+mat2_scalar make_mat2(const mat2_scalar& m) {
+  mat2_scalar copy = m;
+  return copy;
+}
+
+mat2_scalar make_mat2(const vec2_scalar& x, const vec2_scalar& y) {
+  return mat2_scalar(x, y);
+}
+
+template <typename N>
+mat2 make_mat2(const N& n) {
+  mat2 result(n);
+  return result;
+}
+
+template <typename X, typename Y>
+mat2 make_mat2(const X& x, const Y& y) {
+  mat2 result(x, y);
+  return result;
+}
+
+// Per-lane select between two mat2 values: lanes where `c` is set take `t`,
+// the rest take `e`. Each column is selected independently, so column 1 of
+// the result is drawn from t[1] / e[1] (it previously read t[0], which gave
+// selected lanes column 0 of `t` twice).
+SI mat2 if_then_else(I32 c, mat2 t, mat2 e) {
+  return mat2(if_then_else(c, t[0], e[0]), if_then_else(c, t[1], e[1]));
+}
+
+// Scalar condition: picks one whole matrix.
+SI mat2 if_then_else(int32_t c, mat2 t, mat2 e) { return c ? t : e; }
+
+// 3x3 float matrix of plain scalars. data[i] is indexed as a column by the
+// multiplication operators below.
+struct mat3_scalar {
+  vec3_scalar data[3];
+
+  mat3_scalar() = default;
+  constexpr mat3_scalar(vec3_scalar a, vec3_scalar b, vec3_scalar c) {
+    data[0] = a;
+    data[1] = b;
+    data[2] = c;
+  }
+  // Defined out of line, after mat4_scalar is complete.
+  IMPLICIT mat3_scalar(const mat4_scalar& mat);
+
+  vec3_scalar& operator[](int index) { return data[index]; }
+  const vec3_scalar& operator[](int index) const { return data[index]; }
+
+  // Matrix * column vector.
+  friend vec3_scalar operator*(mat3_scalar m, vec3_scalar v) {
+    vec3_scalar product;
+    product.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    product.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    product.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    return product;
+  }
+
+  // Same product with a per-lane vector operand.
+  friend vec3 operator*(mat3_scalar m, vec3 v) {
+    vec3 product;
+    product.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    product.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    product.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    return product;
+  }
+};
+
+// 3x3 float matrix in SIMD form. data[i] is indexed as a column by the
+// multiplication operator below.
+struct mat3 {
+  vec3 data[3];
+
+  vec3& operator[](int index) { return data[index]; }
+  const vec3& operator[](int index) const { return data[index]; }
+  mat3() = default;
+  mat3(vec3 a, vec3 b, vec3 c) {
+    data[0] = a;
+    data[1] = b;
+    data[2] = c;
+  }
+
+  // Converts each column of the scalar matrix to its vec3 form.
+  IMPLICIT constexpr mat3(mat3_scalar s) {
+    for (int col = 0; col < 3; ++col) {
+      data[col] = vec3(s.data[col]);
+    }
+  }
+  // Packs four scalar matrices into the four lanes, s0 in lane 0 through s3
+  // in lane 3.
+  constexpr mat3(mat3_scalar s0, mat3_scalar s1, mat3_scalar s2,
+                 mat3_scalar s3) {
+    for (int col = 0; col < 3; ++col) {
+      data[col] = vec3(s0.data[col], s1.data[col], s2.data[col], s3.data[col]);
+    }
+  }
+
+  // Nine values consumed three at a time, each triple forming one data[]
+  // entry.
+  constexpr mat3(Float d1, Float d2, Float d3, Float d4, Float d5, Float d6,
+                 Float d7, Float d8, Float d9) {
+    data[0] = vec3(d1, d2, d3);
+    data[1] = vec3(d4, d5, d6);
+    data[2] = vec3(d7, d8, d9);
+  }
+
+  // Defined out of line, after mat4 is complete.
+  IMPLICIT mat3(const mat4& mat);
+
+  // Matrix * column vector.
+  friend vec3 operator*(mat3 m, vec3 v) {
+    vec3 product;
+    product.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z;
+    product.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z;
+    product.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z;
+    return product;
+  }
+};
+
+// Reduces a mat3 to scalars by applying force_scalar to each data[] entry.
+mat3_scalar force_scalar(const mat3& v) {
+  const vec3_scalar c0 = force_scalar(v[0]);
+  const vec3_scalar c1 = force_scalar(v[1]);
+  const vec3_scalar c2 = force_scalar(v[2]);
+  return mat3_scalar{c0, c1, c2};
+}
+
+// make_mat3 overloads. The non-template forms produce a mat3_scalar; the
+// templates forward to the mat3 constructors.
+
+// Identity conversion.
+mat3_scalar make_mat3(const mat3_scalar& m) {
+  mat3_scalar copy = m;
+  return copy;
+}
+
+mat3_scalar make_mat3(const vec3_scalar& x, const vec3_scalar& y,
+                      const vec3_scalar& z) {
+  return mat3_scalar(x, y, z);
+}
+
+// Nine floats consumed three at a time, each triple forming one data[]
+// entry.
+constexpr mat3_scalar make_mat3(float m0, float m1, float m2, float m3,
+                                float m4, float m5, float m6, float m7,
+                                float m8) {
+  return mat3_scalar(vec3_scalar(m0, m1, m2), vec3_scalar(m3, m4, m5),
+                     vec3_scalar(m6, m7, m8));
+}
+
+template <typename N>
+mat3 make_mat3(const N& n) {
+  mat3 result(n);
+  return result;
+}
+
+template <typename X, typename Y, typename Z>
+mat3 make_mat3(const X& x, const Y& y, const Z& z) {
+  mat3 result(x, y, z);
+  return result;
+}
+
+// 4x4 float matrix of plain scalars. data[i] is indexed as a column by the
+// multiplication operators below.
+struct mat4_scalar {
+  vec4_scalar data[4];
+
+  mat4_scalar() = default;
+  constexpr mat4_scalar(vec4_scalar a, vec4_scalar b, vec4_scalar c,
+                        vec4_scalar d) {
+    data[0] = a;
+    data[1] = b;
+    data[2] = c;
+    data[3] = d;
+  }
+
+  vec4_scalar& operator[](int index) { return data[index]; }
+  const vec4_scalar& operator[](int index) const { return data[index]; }
+
+  // Reads 16 consecutive floats, four per data[] entry.
+  static mat4_scalar load_from_ptr(const float* f) {
+    mat4_scalar m;
+    // XXX: hopefully this is in the right order
+    for (int col = 0; col < 4; ++col) {
+      const float* src = f + 4 * col;
+      m.data[col] = vec4_scalar{src[0], src[1], src[2], src[3]};
+    }
+    return m;
+  }
+
+  // Matrix * column vector.
+  friend vec4_scalar operator*(mat4_scalar m, vec4_scalar v) {
+    vec4_scalar product;
+    product.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    product.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    product.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    product.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w;
+    return product;
+  }
+
+  // Same product with a per-lane vector operand.
+  friend vec4 operator*(mat4_scalar m, vec4 v) {
+    vec4 product;
+    product.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    product.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    product.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    product.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w;
+    return product;
+  }
+};
+
+// 4x4 float matrix in SIMD form. data[i] is indexed as a column by the
+// multiplication operator below.
+struct mat4 {
+  vec4 data[4];
+
+  mat4() = default;
+  // Converts each column of the scalar matrix to its vec4 form.
+  IMPLICIT constexpr mat4(mat4_scalar s) {
+    for (int col = 0; col < 4; ++col) {
+      data[col] = vec4(s.data[col]);
+    }
+  }
+
+  mat4(vec4 a, vec4 b, vec4 c, vec4 d) {
+    data[0] = a;
+    data[1] = b;
+    data[2] = c;
+    data[3] = d;
+  }
+
+  vec4& operator[](int index) { return data[index]; }
+  const vec4& operator[](int index) const { return data[index]; }
+
+  // Matrix * column vector.
+  friend vec4 operator*(mat4 m, vec4 v) {
+    vec4 product;
+    product.x = m[0].x * v.x + m[1].x * v.y + m[2].x * v.z + m[3].x * v.w;
+    product.y = m[0].y * v.x + m[1].y * v.y + m[2].y * v.z + m[3].y * v.w;
+    product.z = m[0].z * v.x + m[1].z * v.y + m[2].z * v.z + m[3].z * v.w;
+    product.w = m[0].w * v.x + m[1].w * v.y + m[2].w * v.z + m[3].w * v.w;
+    return product;
+  }
+};
+
+// Narrowing matrix constructors, defined here because they need the complete
+// mat4 / mat4_scalar types. Each keeps the upper-left block of the source.
+
+mat3::mat3(const mat4& mat) {
+  for (int col = 0; col < 3; ++col) {
+    data[col] = vec3(mat[col].x, mat[col].y, mat[col].z);
+  }
+}
+
+IMPLICIT mat3_scalar::mat3_scalar(const mat4_scalar& mat) {
+  for (int col = 0; col < 3; ++col) {
+    data[col] = vec3_scalar(mat[col].x, mat[col].y, mat[col].z);
+  }
+}
+
+IMPLICIT mat2::mat2(const mat4& mat) {
+  for (int col = 0; col < 2; ++col) {
+    data[col] = vec2(mat[col].x, mat[col].y);
+  }
+}
+
+IMPLICIT mat2_scalar::mat2_scalar(const mat4_scalar& mat) {
+  for (int col = 0; col < 2; ++col) {
+    data[col] = vec2_scalar(mat[col].x, mat[col].y);
+  }
+}
+
+// Narrowing helpers: build the smaller matrix through its converting
+// constructor from mat4_scalar.
+mat2_scalar make_mat2(const mat4_scalar& m) {
+  mat2_scalar narrowed(m);
+  return narrowed;
+}
+
+mat3_scalar make_mat3(const mat4_scalar& m) {
+  mat3_scalar narrowed(m);
+  return narrowed;
+}
+
+// Reduces a mat4 to scalars by applying force_scalar to each data[] entry.
+mat4_scalar force_scalar(const mat4& v) {
+  const vec4_scalar c0 = force_scalar(v[0]);
+  const vec4_scalar c1 = force_scalar(v[1]);
+  const vec4_scalar c2 = force_scalar(v[2]);
+  const vec4_scalar c3 = force_scalar(v[3]);
+  return mat4_scalar(c0, c1, c2, c3);
+}
+
+// make_mat4 overloads. The non-template forms produce a mat4_scalar; the
+// templates forward to the mat4 constructors.
+
+// Identity conversion.
+mat4_scalar make_mat4(const mat4_scalar& m) {
+  mat4_scalar copy = m;
+  return copy;
+}
+
+mat4_scalar make_mat4(const vec4_scalar& x, const vec4_scalar& y,
+                      const vec4_scalar& z, const vec4_scalar& w) {
+  return mat4_scalar(x, y, z, w);
+}
+
+// Sixteen floats consumed four at a time, each group forming one data[]
+// entry.
+constexpr mat4_scalar make_mat4(float m0, float m1, float m2, float m3,
+                                float m4, float m5, float m6, float m7,
+                                float m8, float m9, float m10, float m11,
+                                float m12, float m13, float m14, float m15) {
+  return mat4_scalar(vec4_scalar(m0, m1, m2, m3), vec4_scalar(m4, m5, m6, m7),
+                     vec4_scalar(m8, m9, m10, m11),
+                     vec4_scalar(m12, m13, m14, m15));
+}
+
+template <typename N>
+mat4 make_mat4(const N& n) {
+  mat4 result(n);
+  return result;
+}
+
+template <typename X, typename Y, typename Z, typename W>
+mat4 make_mat4(const X& x, const Y& y, const Z& z, const W& w) {
+  mat4 result(x, y, z, w);
+  return result;
+}
+
+// Per-lane matrix select: each column is chosen independently with the same
+// mask.
+SI mat3 if_then_else(I32 c, mat3 t, mat3 e) {
+  const vec3 c0 = if_then_else(c, t[0], e[0]);
+  const vec3 c1 = if_then_else(c, t[1], e[1]);
+  const vec3 c2 = if_then_else(c, t[2], e[2]);
+  return mat3{c0, c1, c2};
+}
+
+// Scalar condition: picks one whole matrix.
+SI mat3 if_then_else(int32_t c, mat3 t, mat3 e) {
+  if (c) {
+    return t;
+  }
+  return e;
+}
+
+SI mat4 if_then_else(I32 c, mat4 t, mat4 e) {
+  const vec4 c0 = if_then_else(c, t[0], e[0]);
+  const vec4 c1 = if_then_else(c, t[1], e[1]);
+  const vec4 c2 = if_then_else(c, t[2], e[2]);
+  const vec4 c3 = if_then_else(c, t[3], e[3]);
+  return mat4{c0, c1, c2, c3};
+}
+
+// Scalar condition: picks one whole matrix.
+SI mat4 if_then_else(int32_t c, mat4 t, mat4 e) {
+  if (c) {
+    return t;
+  }
+  return e;
+}
+
+// Linear interpolation, computed as x + (y - x) * a. The result type is the
+// SIMD vector_type of T, so mixed scalar/vector operands widen.
+template <typename T, typename U, typename A,
+          typename R = typename T::vector_type>
+SI R mix(T x, U y, A a) {
+  auto span = y - x;
+  return span * a + x;
+}
+
+SI Float mix(Float x, Float y, Float a) {
+  Float span = y - x;
+  return span * a + x;
+}
+
+// Same-type operands with a plain float weight.
+template <typename T>
+SI T mix(T x, T y, float a) {
+  auto span = y - x;
+  return span * a + x;
+}
+
+// A separate weight for each of the four components.
+template <typename T>
+SI T mix(T x, T y, vec4_scalar a) {
+  auto mx = mix(x.x, y.x, a.x);
+  auto my = mix(x.y, y.y, a.y);
+  auto mz = mix(x.z, y.z, a.z);
+  auto mw = mix(x.w, y.w, a.w);
+  return T{mx, my, mz, mw};
+}
+
+// Four-component swizzles of the two-component vectors, defined out of line
+// here, after ivec4 / vec4 have been fully declared.
+ivec4 ivec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  const auto first = select(c1);
+  const auto second = select(c2);
+  const auto third = select(c3);
+  const auto fourth = select(c4);
+  return ivec4(first, second, third, fourth);
+}
+
+vec4 vec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
+  const auto first = select(c1);
+  const auto second = select(c2);
+  const auto third = select(c3);
+  const auto fourth = select(c4);
+  return vec4(first, second, third, fourth);
+}
+
+// any(): OR of all components. Scalar forms return bool, SIMD forms return
+// the per-lane Bool.
+bool any(bool x) { return x; }
+
+Bool any(bvec4 x) { return (x.x | x.y) | (x.z | x.w); }
+
+bool any(bvec4_scalar x) { return (x.x | x.y) | (x.z | x.w); }
+
+Bool any(bvec2 x) { return x.y | x.x; }
+
+bool any(bvec2_scalar x) { return x.y | x.x; }
+
+// all(): AND of all components.
+bool all(bool x) { return x; }
+
+Bool all(bvec2 x) { return x.y & x.x; }
+
+bool all(bvec2_scalar x) { return x.y & x.x; }
+
+Bool all(bvec4 x) { return (x.x & x.y) & (x.z & x.w); }
+
+bool all(bvec4_scalar x) { return (x.x & x.y) & (x.z & x.w); }
+
+// Selects with a boolean vector: every component has its own mask.
+SI vec4 if_then_else(bvec4 c, vec4 t, vec4 e) {
+  const auto rx = if_then_else(c.x, t.x, e.x);
+  const auto ry = if_then_else(c.y, t.y, e.y);
+  const auto rz = if_then_else(c.z, t.z, e.z);
+  const auto rw = if_then_else(c.w, t.w, e.w);
+  return vec4(rx, ry, rz, rw);
+}
+SI vec3 if_then_else(bvec3 c, vec3 t, vec3 e) {
+  const auto rx = if_then_else(c.x, t.x, e.x);
+  const auto ry = if_then_else(c.y, t.y, e.y);
+  const auto rz = if_then_else(c.z, t.z, e.z);
+  return vec3(rx, ry, rz);
+}
+
+SI vec2 if_then_else(bvec2 c, vec2 t, vec2 e) {
+  const auto rx = if_then_else(c.x, t.x, e.x);
+  const auto ry = if_then_else(c.y, t.y, e.y);
+  return vec2(rx, ry);
+}
+
+// mix() with a boolean selector: a set component picks `y`, a clear one
+// picks `x`. The SIMD forms defer to if_then_else.
+template <typename T, typename R = typename T::vector_type>
+SI R mix(T x, T y, bvec4 a) {
+  R chosen = if_then_else(a, y, x);
+  return chosen;
+}
+
+template <typename T, typename R = typename T::vector_type>
+SI R mix(T x, T y, bvec3 a) {
+  R chosen = if_then_else(a, y, x);
+  return chosen;
+}
+
+template <typename T, typename R = typename T::vector_type>
+SI R mix(T x, T y, bvec2 a) {
+  R chosen = if_then_else(a, y, x);
+  return chosen;
+}
+
+// Scalar selectors: choose each component with a plain conditional.
+template <typename T>
+SI T mix(T x, T y, bvec4_scalar a) {
+  return T{a.x ? y.x : x.x,
+           a.y ? y.y : x.y,
+           a.z ? y.z : x.z,
+           a.w ? y.w : x.w};
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec3_scalar a) {
+  return T{a.x ? y.x : x.x,
+           a.y ? y.y : x.y,
+           a.z ? y.z : x.z};
+}
+
+template <typename T>
+SI T mix(T x, T y, bvec2_scalar a) {
+  return T{a.x ? y.x : x.x,
+           a.y ? y.y : x.y};
+}
+
+// Dot products. Each sum is kept as one expression so the evaluation order
+// of the floating-point operations is unchanged.
+float dot(vec3_scalar a, vec3_scalar b) {
+  const float sum = a.x * b.x + a.y * b.y + a.z * b.z;
+  return sum;
+}
+
+Float dot(vec3 a, vec3 b) {
+  const Float sum = a.x * b.x + a.y * b.y + a.z * b.z;
+  return sum;
+}
+
+float dot(vec2_scalar a, vec2_scalar b) {
+  const float sum = a.x * b.x + a.y * b.y;
+  return sum;
+}
+
+Float dot(vec2 a, vec2 b) {
+  const Float sum = a.x * b.x + a.y * b.y;
+  return sum;
+}
+
+// Trigonometric functions. Each name is first redirected by a macro to a
+// __glsl_-prefixed identifier, so the definitions below are emitted under
+// that prefixed name. The Float overloads call the libm float function once
+// per lane.
+
+#define sin __glsl_sin
+
+float sin(float x) { return sinf(x); }
+
+Float sin(Float v) {
+  Float lanes = {sinf(v.x), sinf(v.y), sinf(v.z), sinf(v.w)};
+  return lanes;
+}
+
+#define cos __glsl_cos
+
+float cos(float x) { return cosf(x); }
+
+Float cos(Float v) {
+  Float lanes = {cosf(v.x), cosf(v.y), cosf(v.z), cosf(v.w)};
+  return lanes;
+}
+
+#define tan __glsl_tan
+
+float tan(float x) { return tanf(x); }
+
+Float tan(Float v) {
+  Float lanes = {tanf(v.x), tanf(v.y), tanf(v.z), tanf(v.w)};
+  return lanes;
+}
+
+#define atan __glsl_atan
+
+float atan(float x) { return atanf(x); }
+
+Float atan(Float v) {
+  Float lanes = {atanf(v.x), atanf(v.y), atanf(v.z), atanf(v.w)};
+  return lanes;
+}
+
+// Two-argument form: maps onto atan2f(a, b).
+float atan(float a, float b) { return atan2f(a, b); }
+
+Float atan(Float a, Float b) {
+  Float lanes = {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z),
+                 atan2f(a.w, b.w)};
+  return lanes;
+}
+
+// Component-wise inequality of integer vectors.
+bvec4 notEqual(ivec4 a, ivec4 b) {
+  const auto dx = a.x != b.x;
+  const auto dy = a.y != b.y;
+  const auto dz = a.z != b.z;
+  const auto dw = a.w != b.w;
+  return bvec4(dx, dy, dz, dw);
+}
+
+bvec4_scalar notEqual(ivec4_scalar a, ivec4_scalar b) {
+  const bool dx = a.x != b.x;
+  const bool dy = a.y != b.y;
+  const bool dz = a.z != b.z;
+  const bool dw = a.w != b.w;
+  return bvec4_scalar{dx, dy, dz, dw};
+}
+
+// 3x3 transpose: entry i of the result gathers component i of each source
+// entry.
+mat3 transpose(mat3 m) {
+  const vec3 row_x(m[0].x, m[1].x, m[2].x);
+  const vec3 row_y(m[0].y, m[1].y, m[2].y);
+  const vec3 row_z(m[0].z, m[1].z, m[2].z);
+  return mat3(row_x, row_y, row_z);
+}
+
+mat3_scalar transpose(mat3_scalar m) {
+  const vec3_scalar row_x(m[0].x, m[1].x, m[2].x);
+  const vec3_scalar row_y(m[0].y, m[1].y, m[2].y);
+  const vec3_scalar row_z(m[0].z, m[1].z, m[2].z);
+  return mat3_scalar{row_x, row_y, row_z};
+}
+
+// Component-wise absolute value.
+vec2 abs(vec2 v) {
+  const auto ax = abs(v.x);
+  const auto ay = abs(v.y);
+  return vec2(ax, ay);
+}
+
+vec2_scalar abs(vec2_scalar v) {
+  const float ax = fabsf(v.x);
+  const float ay = fabsf(v.y);
+  return vec2_scalar{ax, ay};
+}
+
+// Floored modulus: a - b * floor(a / b).
+Float mod(Float a, Float b) {
+  Float remainder = a - b * floor(a / b);
+  return remainder;
+}
+
+vec2 mod(vec2 a, vec2 b) {
+  const auto mx = mod(a.x, b.x);
+  const auto my = mod(a.y, b.y);
+  return vec2(mx, my);
+}
+
+vec3 abs(vec3 v) {
+  const auto ax = abs(v.x);
+  const auto ay = abs(v.y);
+  const auto az = abs(v.z);
+  return vec3(ax, ay, az);
+}
+
+// 2x2 inverse: the adjugate scaled by the reciprocal of the determinant.
+// There is no check for a zero determinant.
+mat2 inverse(mat2 v) {
+  Float det = v[0].x * v[1].y - v[0].y * v[1].x;
+  const vec2 adj0(v[1].y, -v[0].y);
+  const vec2 adj1(-v[1].x, v[0].x);
+  return mat2(adj0, adj1) * (1. / det);
+}
+
+mat2_scalar inverse(mat2_scalar v) {
+  float det = v[0].x * v[1].y - v[0].y * v[1].x;
+  const vec2_scalar adj0(v[1].y, -v[0].y);
+  const vec2_scalar adj1(-v[1].x, v[0].x);
+  return mat2_scalar(adj0, adj1) * (1. / det);
+}
+
+// get_nth(value, n): extracts lane `n` of a SIMD value as its scalar
+// counterpart. A plain float has a single value, so the lane is ignored.
+int32_t get_nth(I32 a, int n) {
+  const int32_t lane = a[n];
+  return lane;
+}
+
+float get_nth(Float a, int n) {
+  const float lane = a[n];
+  return lane;
+}
+
+float get_nth(float a, int) { return a; }
+
+ivec2_scalar get_nth(ivec2 a, int n) {
+  return ivec2_scalar{a.x[n],
+                      a.y[n]};
+}
+
+vec2_scalar get_nth(vec2 a, int n) {
+  return vec2_scalar{a.x[n],
+                     a.y[n]};
+}
+
+vec3_scalar get_nth(vec3 a, int n) {
+  return vec3_scalar{a.x[n],
+                     a.y[n],
+                     a.z[n]};
+}
+
+vec4_scalar get_nth(vec4 a, int n) {
+  return vec4_scalar{a.x[n],
+                     a.y[n],
+                     a.z[n],
+                     a.w[n]};
+}
+
+ivec4_scalar get_nth(ivec4 a, int n) {
+  return ivec4_scalar{a.x[n],
+                      a.y[n],
+                      a.z[n],
+                      a.w[n]};
+}
+
+// Extracts lane `n` from each data[] entry and reassembles the matrix.
+mat3_scalar get_nth(mat3 a, int n) {
+  const vec3_scalar c0 = get_nth(a[0], n);
+  const vec3_scalar c1 = get_nth(a[1], n);
+  const vec3_scalar c2 = get_nth(a[2], n);
+  return make_mat3(c0, c1, c2);
+}
+
+// put_nth(dst, n, src): writes the scalar `src` into lane `n` of the SIMD
+// value `dst`, leaving the other lanes untouched.
+void put_nth(Float& dst, int n, float src) {
+  dst[n] = src;
+}
+
+void put_nth(I32& dst, int n, int32_t src) {
+  dst[n] = src;
+}
+
+void put_nth(ivec2& dst, int n, ivec2_scalar src) {
+  dst.y[n] = src.y;
+  dst.x[n] = src.x;
+}
+
+void put_nth(vec2& dst, int n, vec2_scalar src) {
+  dst.y[n] = src.y;
+  dst.x[n] = src.x;
+}
+
+void put_nth(vec3& dst, int n, vec3_scalar src) {
+  dst.z[n] = src.z;
+  dst.y[n] = src.y;
+  dst.x[n] = src.x;
+}
+
+void put_nth(ivec4& dst, int n, ivec4_scalar src) {
+  dst.w[n] = src.w;
+  dst.z[n] = src.z;
+  dst.y[n] = src.y;
+  dst.x[n] = src.x;
+}
+
+void put_nth(vec4& dst, int n, vec4_scalar src) {
+  dst.w[n] = src.w;
+  dst.z[n] = src.z;
+  dst.y[n] = src.y;
+  dst.x[n] = src.x;
+}
+
+// ElementType<V>::ty names the scalar element type of V. It is a trait
+// rather than a direct V::element_type lookup so that it can also be
+// specialized for types with no nested typedef: float, int, Float and I32.
+template <typename V>
+struct ElementType {
+  using ty = typename V::element_type;
+};
+
+template <>
+struct ElementType<float> {
+  using ty = float;
+};
+
+// NOTE(review): int maps to float here, while I32 maps to int32_t; confirm
+// this is intentional before relying on it.
+template <>
+struct ElementType<int> {
+  using ty = float;
+};
+
+template <>
+struct ElementType<Float> {
+  using ty = float;
+};
+
+template <>
+struct ElementType<I32> {
+  using ty = int32_t;
+};
+
+// put_nth_component(dst, n, src): stores `src` into component `n` of `dst`
+// (0 = x, 1 = y, 2 = z, 3 = w). An index the type does not have is ignored.
+// Plain scalars only have component 0.
+
+void put_nth_component(ivec2_scalar& dst, int n, int32_t src) {
+  if (n == 0) {
+    dst.x = src;
+  } else if (n == 1) {
+    dst.y = src;
+  }
+}
+
+void put_nth_component(ivec4_scalar& dst, int n, int32_t src) {
+  if (n == 0) {
+    dst.x = src;
+  } else if (n == 1) {
+    dst.y = src;
+  } else if (n == 2) {
+    dst.z = src;
+  } else if (n == 3) {
+    dst.w = src;
+  }
+}
+
+void put_nth_component(int& dst, int n, int src) {
+  if (n == 0) {
+    dst = src;
+  }
+}
+
+void put_nth_component(float& dst, int n, float src) {
+  if (n == 0) {
+    dst = src;
+  }
+}
+
+void put_nth_component(vec2_scalar& dst, int n, float src) {
+  if (n == 0) {
+    dst.x = src;
+  } else if (n == 1) {
+    dst.y = src;
+  }
+}
+
+void put_nth_component(vec3_scalar& dst, int n, float src) {
+  if (n == 0) {
+    dst.x = src;
+  } else if (n == 1) {
+    dst.y = src;
+  } else if (n == 2) {
+    dst.z = src;
+  }
+}
+
+void put_nth_component(vec4_scalar& dst, int n, float src) {
+  if (n == 0) {
+    dst.x = src;
+  } else if (n == 1) {
+    dst.y = src;
+  } else if (n == 2) {
+    dst.z = src;
+  } else if (n == 3) {
+    dst.w = src;
+  }
+}
+
+// Builds the four-lane ramp {init0, init0 + step, ...}. Each lane is the
+// previous lane plus `step` (repeated addition, not init0 + k * step).
+Float init_interp(float init0, float step) {
+  float lanes[4];
+  lanes[0] = init0;
+  for (int lane = 1; lane < 4; ++lane) {
+    lanes[lane] = lanes[lane - 1] + step;
+  }
+  return {lanes[0], lanes[1], lanes[2], lanes[3]};
+}
+
+// Component-wise ramps for the vector types.
+vec2 init_interp(vec2_scalar init, vec2_scalar step) {
+  const Float rx = init_interp(init.x, step.x);
+  const Float ry = init_interp(init.y, step.y);
+  return vec2(rx, ry);
+}
+
+vec3 init_interp(vec3_scalar init, vec3_scalar step) {
+  const Float rx = init_interp(init.x, step.x);
+  const Float ry = init_interp(init.y, step.y);
+  const Float rz = init_interp(init.z, step.z);
+  return vec3(rx, ry, rz);
+}
+
+vec4 init_interp(vec4_scalar init, vec4_scalar step) {
+  const Float rx = init_interp(init.x, step.x);
+  const Float ry = init_interp(init.y, step.y);
+  const Float rz = init_interp(init.z, step.z);
+  const Float rw = init_interp(init.w, step.w);
+  return vec4(rx, ry, rz, rw);
+}
+
+// Fixed-size array wrapper with unchecked indexing.
+template <typename T, size_t N>
+struct Array {
+  T elements[N];
+
+  T& operator[](size_t i) { return elements[i]; }
+  const T& operator[](size_t i) const { return elements[i]; }
+
+  // Overwrites every element with T(s[k]) from a same-length array of
+  // another element type.
+  template <typename S>
+  void convert(const Array<S, N>& s) {
+    size_t k = 0;
+    while (k < N) {
+      elements[k] = T(s[k]);
+      ++k;
+    }
+  }
+};
+
+// Element-wise select over two arrays of vec2 using one shared lane mask.
+template <size_t SIZE>
+Array<vec2, SIZE> if_then_else(I32 c, Array<vec2, SIZE> t,
+                               Array<vec2, SIZE> e) {
+  Array<vec2, SIZE> chosen;
+  for (size_t slot = 0; slot != SIZE; ++slot) {
+    chosen[slot] = if_then_else(c, t[slot], e[slot]);
+  }
+  return chosen;
+}
+
+} // namespace glsl
diff --git a/gfx/wr/swgl/src/lib.rs b/gfx/wr/swgl/src/lib.rs
new file mode 100644
index 0000000000..e8fc030e0c
--- /dev/null
+++ b/gfx/wr/swgl/src/lib.rs
@@ -0,0 +1,12 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#![crate_name = "swgl"]
+#![crate_type = "lib"]
+
+extern crate gleam;
+
+mod swgl_fns;
+
+pub use crate::swgl_fns::*;
diff --git a/gfx/wr/swgl/src/program.h b/gfx/wr/swgl/src/program.h
new file mode 100644
index 0000000000..fcc6714c82
--- /dev/null
+++ b/gfx/wr/swgl/src/program.h
@@ -0,0 +1,166 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+struct VertexAttrib;
+
+namespace glsl {
+
+// Type holding group of scalars interpolated across rasterized rows and spans,
+// shuttling values between vertex shaders and fragment shaders.
+// GCC requires power-of-two vector sizes, so must use glsl type as workaround
+// to operate in Float-sized chunks.
+using Interpolants = vec3;
+
+struct VertexShaderImpl;
+struct FragmentShaderImpl;
+
+// Abstract interface to a shader program. Every member other than the
+// destructor is pure virtual, so a concrete program type must provide all of
+// the lookups and accessors below.
+struct ProgramImpl {
+ virtual ~ProgramImpl() {}
+ // Looks up a uniform by name. The meaning of the returned int is defined by
+ // the implementer (presumably a location index — TODO confirm).
+ virtual int get_uniform(const char* name) const = 0;
+ // Associates the named vertex attribute with the given index.
+ virtual void bind_attrib(const char* name, int index) = 0;
+ // Looks up a vertex attribute by name (presumably returning its index).
+ virtual int get_attrib(const char* name) const = 0;
+ // Size of the interpolant data used by this program; units are defined by
+ // the implementer (NOTE(review): likely bytes — confirm).
+ virtual size_t interpolants_size() const = 0;
+ // Accessors for the vertex and fragment shader stages of this program.
+ virtual VertexShaderImpl* get_vertex_shader() = 0;
+ virtual FragmentShaderImpl* get_fragment_shader() = 0;
+ // Name identifying this program.
+ virtual const char* get_name() const = 0;
+};
+
+// Pointer to a function taking no arguments and returning a ProgramImpl*.
+using ProgramLoader = ProgramImpl* (*)();
+
+// Base state for a vertex shader. Behavior is supplied through a table of
+// plain function pointers rather than virtual methods; each wrapper method
+// below forwards to the matching pointer, passing `this` as first argument.
+// All pointers default to nullptr and the wrappers do not check for null, so
+// a pointer must be set before its wrapper is called.
+struct VertexShaderImpl {
+ typedef void (*SetUniform1iFunc)(VertexShaderImpl*, int index, int value);
+ typedef void (*SetUniform4fvFunc)(VertexShaderImpl*, int index,
+ const float* value);
+ typedef void (*SetUniformMatrix4fvFunc)(VertexShaderImpl*, int index,
+ const float* value);
+ typedef void (*InitBatchFunc)(VertexShaderImpl*);
+ typedef void (*LoadAttribsFunc)(VertexShaderImpl*, VertexAttrib* attribs,
+ uint32_t start, int instance, int count);
+ typedef void (*RunPrimitiveFunc)(VertexShaderImpl*, char* interps,
+ size_t interp_stride);
+
+ // Dispatch table; entries are expected to be filled in by the concrete
+ // shader (not visible in this header).
+ SetUniform1iFunc set_uniform_1i_func = nullptr;
+ SetUniform4fvFunc set_uniform_4fv_func = nullptr;
+ SetUniformMatrix4fvFunc set_uniform_matrix4fv_func = nullptr;
+ InitBatchFunc init_batch_func = nullptr;
+ LoadAttribsFunc load_attribs_func = nullptr;
+ RunPrimitiveFunc run_primitive_func = nullptr;
+
+ // Vertex position member, named after the GLSL built-in of the same name.
+ vec4 gl_Position;
+
+ // Forwards to set_uniform_1i_func.
+ void set_uniform_1i(int index, int value) {
+ (*set_uniform_1i_func)(this, index, value);
+ }
+
+ // Forwards to set_uniform_4fv_func.
+ void set_uniform_4fv(int index, const float* value) {
+ (*set_uniform_4fv_func)(this, index, value);
+ }
+
+ // Forwards to set_uniform_matrix4fv_func.
+ void set_uniform_matrix4fv(int index, const float* value) {
+ (*set_uniform_matrix4fv_func)(this, index, value);
+ }
+
+ // Forwards to init_batch_func.
+ void init_batch() { (*init_batch_func)(this); }
+
+ // Forwards to load_attribs_func.
+ ALWAYS_INLINE void load_attribs(VertexAttrib* attribs, uint32_t start,
+ int instance, int count) {
+ (*load_attribs_func)(this, attribs, start, instance, count);
+ }
+
+ // Forwards to run_primitive_func.
+ ALWAYS_INLINE void run_primitive(char* interps, size_t interp_stride) {
+ (*run_primitive_func)(this, interps, interp_stride);
+ }
+};
+
+// Base state for a fragment shader. As with the vertex shader, behavior is
+// supplied through plain function pointers that the wrapper methods forward
+// to with `this` as first argument. The init_span/run/skip entry points come
+// in two flavors, selected by the W template parameter of the wrappers: the
+// default set and a "_w" set (presumably the perspective-corrected variants —
+// TODO confirm). The draw_span entry points are optional; has_draw_span
+// reports whether one is installed for a given output pointer type.
+struct FragmentShaderImpl {
+ typedef void (*InitSpanFunc)(FragmentShaderImpl*, const void* interps,
+ const void* step);
+ typedef void (*RunFunc)(FragmentShaderImpl*);
+ typedef void (*SkipFunc)(FragmentShaderImpl*, int steps);
+ typedef void (*InitSpanWFunc)(FragmentShaderImpl*, const void* interps,
+ const void* step);
+ typedef void (*RunWFunc)(FragmentShaderImpl*);
+ typedef void (*SkipWFunc)(FragmentShaderImpl*, int steps);
+ typedef void (*DrawSpanRGBA8Func)(FragmentShaderImpl*);
+ typedef void (*DrawSpanR8Func)(FragmentShaderImpl*);
+
+ // Dispatch table; all entries default to nullptr. Only the draw_span
+ // entries are null-checked (via has_draw_span) before use.
+ InitSpanFunc init_span_func = nullptr;
+ RunFunc run_func = nullptr;
+ SkipFunc skip_func = nullptr;
+ InitSpanWFunc init_span_w_func = nullptr;
+ RunWFunc run_w_func = nullptr;
+ SkipWFunc skip_w_func = nullptr;
+ DrawSpanRGBA8Func draw_span_RGBA8_func = nullptr;
+ DrawSpanR8Func draw_span_R8_func = nullptr;
+
+ // Bit flags recording optional features this shader uses.
+ enum FLAGS {
+ DISCARD = 1 << 0,
+ PERSPECTIVE = 1 << 1,
+ };
+ int flags = 0;
+ void enable_discard() { flags |= DISCARD; }
+ void enable_perspective() { flags |= PERSPECTIVE; }
+ ALWAYS_INLINE bool use_discard() const { return (flags & DISCARD) != 0; }
+ ALWAYS_INLINE bool use_perspective() const {
+ return (flags & PERSPECTIVE) != 0;
+ }
+
+ // Members named after the corresponding GLSL fragment shader built-ins.
+ vec4 gl_FragCoord;
+ vec4 gl_FragColor;
+ vec4 gl_SecondaryFragColor;
+
+ // Per-pixel increments applied to gl_FragCoord.z (x) and gl_FragCoord.w (y)
+ // by step_perspective.
+ vec2_scalar swgl_StepZW;
+ Bool swgl_IsPixelDiscarded = false;
+ // The current buffer position for committing span output.
+ uint32_t* swgl_OutRGBA8 = nullptr;
+ uint8_t* swgl_OutR8 = nullptr;
+ // The remaining number of pixels in the span.
+ int32_t swgl_SpanLength = 0;
+ // The number of pixels in a step.
+ enum : int32_t { swgl_StepSize = 4 };
+
+ // Advances the X fragment coordinate by the given number of pixels.
+ ALWAYS_INLINE void step_fragcoord(int steps = 4) { gl_FragCoord.x += steps; }
+
+ // Advances the Z and W fragment coordinates by the given number of pixels,
+ // scaled by the per-pixel increments in swgl_StepZW.
+ ALWAYS_INLINE void step_perspective(int steps = 4) {
+ gl_FragCoord.z += swgl_StepZW.x * steps;
+ gl_FragCoord.w += swgl_StepZW.y * steps;
+ }
+
+ // Forwards to init_span_w_func when W is true, otherwise init_span_func.
+ template <bool W = false>
+ ALWAYS_INLINE void init_span(const void* interps, const void* step) {
+ (*(W ? init_span_w_func : init_span_func))(this, interps, step);
+ }
+
+ // Forwards to run_w_func when W is true, otherwise run_func.
+ template <bool W = false>
+ ALWAYS_INLINE void run() {
+ (*(W ? run_w_func : run_func))(this);
+ }
+
+ // Forwards to skip_w_func when W is true, otherwise skip_func.
+ template <bool W = false>
+ ALWAYS_INLINE void skip(int steps = 4) {
+ (*(W ? skip_w_func : skip_func))(this, steps);
+ }
+
+ // Records the RGBA8 output position and span length, then invokes the
+ // RGBA8 span shader. Callers should check has_draw_span first, since the
+ // function pointer is not null-checked here.
+ ALWAYS_INLINE void draw_span(uint32_t* buf, int len) {
+ swgl_OutRGBA8 = buf;
+ swgl_SpanLength = len;
+ (*draw_span_RGBA8_func)(this);
+ }
+
+ // The pointer argument is used only to select the RGBA8 overload.
+ ALWAYS_INLINE bool has_draw_span(uint32_t*) {
+ return draw_span_RGBA8_func != nullptr;
+ }
+
+ // Records the R8 output position and span length, then invokes the R8 span
+ // shader. Callers should check has_draw_span first, since the function
+ // pointer is not null-checked here.
+ ALWAYS_INLINE void draw_span(uint8_t* buf, int len) {
+ swgl_OutR8 = buf;
+ swgl_SpanLength = len;
+ (*draw_span_R8_func)(this);
+ }
+
+ // The pointer argument is used only to select the R8 overload.
+ ALWAYS_INLINE bool has_draw_span(uint8_t*) {
+ return draw_span_R8_func != nullptr;
+ }
+};
+
+} // namespace glsl
diff --git a/gfx/wr/swgl/src/swgl_ext.h b/gfx/wr/swgl/src/swgl_ext.h
new file mode 100644
index 0000000000..fd4e587889
--- /dev/null
+++ b/gfx/wr/swgl/src/swgl_ext.h
@@ -0,0 +1,532 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Writes one chunk of unpacked RGBA8 pixels to buf. When blending is active
+// (blend_key is set), the chunk is first blended against the pixels already
+// stored at buf.
+static inline void commit_span(uint32_t* buf, WideRGBA8 r) {
+  if (!blend_key) {
+    unaligned_store(buf, pack(r));
+    return;
+  }
+  r = blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r);
+  unaligned_store(buf, pack(r));
+}
+
+// Writes one chunk of already-packed RGBA8 pixels to buf. When blending is
+// active (blend_key is set), the chunk is unpacked, blended against the
+// pixels already stored at buf, and repacked before the store.
+static inline void commit_span(uint32_t* buf, PackedRGBA8 r) {
+  if (!blend_key) {
+    unaligned_store(buf, r);
+    return;
+  }
+  r = pack(blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), unpack(r)));
+  unaligned_store(buf, r);
+}
+
+// Fills len RGBA8 pixels starting at buf with the single color chunk r.
+// With blending active, whole groups of 4 pixels are blended with full loads
+// and stores, and the remaining 0-3 pixels are handled with partial loads and
+// stores so nothing past buf[len] is touched. Without blending, the first
+// packed pixel of r is replicated across the span.
+UNUSED static inline void commit_solid_span(uint32_t* buf, WideRGBA8 r,
+ int len) {
+ if (blend_key) {
+ // len & ~3 rounds len down to a multiple of 4.
+ for (uint32_t* end = &buf[len & ~3]; buf < end; buf += 4) {
+ unaligned_store(
+ buf, pack(blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r)));
+ }
+ // Leftover pixel count after the full chunks.
+ len &= 3;
+ if (len > 0) {
+ partial_store_span(
+ buf,
+ pack(blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r,
+ len)),
+ len);
+ }
+ } else {
+ fill_n(buf, len, bit_cast<U32>(pack(r)).x);
+ }
+}
+
+// Writes one chunk of unpacked R8 pixels to buf. When blending is active
+// (blend_key is set), the chunk is first blended against the pixels already
+// stored at buf.
+static inline void commit_span(uint8_t* buf, WideR8 r) {
+  if (!blend_key) {
+    unaligned_store(buf, pack(r));
+    return;
+  }
+  r = blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r);
+  unaligned_store(buf, pack(r));
+}
+
+// Fills len R8 pixels starting at buf with the single value chunk r, blending
+// against the existing contents when blend_key is set.
+// NOTE(review): unlike the RGBA8 overload, neither path handles a tail. The
+// blending loop stores 4 bytes per iteration until buf reaches the original
+// buf + len, so a len that is not a multiple of 4 writes past the span; the
+// non-blending path writes len / 4 32-bit words, so it leaves the last
+// len % 4 bytes unwritten. Confirm that callers always pass a multiple of 4
+// (or that the destination is padded accordingly).
+UNUSED static inline void commit_solid_span(uint8_t* buf, WideR8 r, int len) {
+ if (blend_key) {
+ for (uint8_t* end = &buf[len]; buf < end; buf += 4) {
+ unaligned_store(buf, pack(blend_pixels(
+ buf, unpack(unaligned_load<PackedR8>(buf)), r)));
+ }
+ } else {
+ fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(pack(r)));
+ }
+}
+
+// pack_span overloads: the (unnamed) pointer parameter is used only to pick
+// the output format from the destination buffer type. uint32_t* selects the
+// RGBA8 packing routines and uint8_t* selects the R8 ones.
+
+// Packs the supplied value v for an RGBA8 destination.
+template <typename V>
+static inline WideRGBA8 pack_span(uint32_t*, const V& v) {
+ return pack_pixels_RGBA8(v);
+}
+
+// Argument-less RGBA8 form; forwards to the argument-less pack_pixels_RGBA8.
+static inline WideRGBA8 pack_span(uint32_t*) { return pack_pixels_RGBA8(); }
+
+// Packs the supplied value c for an R8 destination.
+template <typename C>
+static inline WideR8 pack_span(uint8_t*, C c) {
+ return pack_pixels_R8(c);
+}
+
+// Argument-less R8 form; forwards to the argument-less pack_pixels_R8.
+static inline WideR8 pack_span(uint8_t*) { return pack_pixels_R8(); }
+
+// Forces a value with vector run-class to have scalar run-class.
+// This is a thin wrapper that forwards to force_scalar; the return type is
+// whatever force_scalar yields for the argument type.
+template <typename T>
+static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) {
+ return force_scalar(v);
+}
+
+// Advance all varying interpolants by a single chunk. Expands to a call of
+// step_interp_inputs() with no arguments in the enclosing scope.
+#define swgl_stepInterp() step_interp_inputs()
+
+// Pseudo-intrinsic that accesses the interpolation step for a given varying.
+// Expands to the member named v of an interp_step object that must be visible
+// in the enclosing scope.
+#define swgl_interpStep(v) (interp_step.v)
+
+// Commit an entire span of a solid color. The value v is packed for the given
+// format, written to all remaining swgl_SpanLength pixels, and then the output
+// pointer is advanced past the span and swgl_SpanLength is reset to 0.
+// `format` is pasted onto swgl_Out to pick the output pointer (RGBA8 or R8).
+#define swgl_commitSolid(format, v) \
+ do { \
+ commit_solid_span(swgl_Out##format, pack_span(swgl_Out##format, (v)), \
+ swgl_SpanLength); \
+ swgl_Out##format += swgl_SpanLength; \
+ swgl_SpanLength = 0; \
+ } while (0)
+#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v)
+#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v)
+
+// Commit a single chunk of swgl_StepSize pixels to the output for the given
+// format, then advance the output pointer by swgl_StepSize and reduce the
+// remaining swgl_SpanLength by the same amount.
+#define swgl_commitChunk(format, chunk) \
+ do { \
+ commit_span(swgl_Out##format, chunk); \
+ swgl_Out##format += swgl_StepSize; \
+ swgl_SpanLength -= swgl_StepSize; \
+ } while (0)
+
+// Converts a per-pixel alpha vector to an unpacked RGBA8 chunk. The alpha
+// lanes are rounded to integer pixel values, and the zip/combine sequence then
+// duplicates them (this appears to broadcast each pixel's alpha to all four of
+// its channels — confirm against zipLow/zipHigh/packRGBA8 semantics).
+static inline WideRGBA8 pack_pixels_RGBA8(Float alpha) {
+ I32 i = round_pixel(alpha);
+ HalfRGBA8 c = packRGBA8(zipLow(i, i), zipHigh(i, i));
+ return combine(zipLow(c, c), zipHigh(c, c));
+}
+
+// Converts a single scalar alpha to an unpacked RGBA8 chunk. The scalar is
+// rounded to an integer pixel value held in an I32, which is then packed and
+// combined with itself to fill the whole chunk.
+static inline WideRGBA8 pack_pixels_RGBA8(float alpha) {
+ I32 i = round_pixel(alpha);
+ HalfRGBA8 c = packRGBA8(i, i);
+ return combine(c, c);
+}
+
+// Commit a single chunk of a color scaled by an alpha weight. Both color and
+// alpha are packed with pack_pixels_<format> and multiplied together with
+// muldiv255 before being committed via swgl_commitChunk.
+#define swgl_commitColor(format, color, alpha) \
+ swgl_commitChunk(format, muldiv255(pack_pixels_##format(color), \
+ pack_pixels_##format(alpha)))
+#define swgl_commitColorRGBA8(color, alpha) \
+ swgl_commitColor(RGBA8, color, alpha)
+#define swgl_commitColorR8(color, alpha) swgl_commitColor(R8, color, alpha)
+
+// Reports whether the sampler's filter is TextureFilter::LINEAR.
+template <typename S>
+static ALWAYS_INLINE bool swgl_isTextureLinear(S s) {
+  const bool isLinear = (s->filter == TextureFilter::LINEAR);
+  return isLinear;
+}
+
+// Reports whether the sampler's format is TextureFormat::RGBA8.
+template <typename S>
+static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) {
+  const bool isRGBA8 = (s->format == TextureFormat::RGBA8);
+  return isRGBA8;
+}
+
+// Reports whether the sampler's format is TextureFormat::R8.
+template <typename S>
+static ALWAYS_INLINE bool swgl_isTextureR8(S s) {
+  const bool isR8 = (s->format == TextureFormat::R8);
+  return isR8;
+}
+
+// Returns the offset into the texture buffer for the given layer index. If not
+// a texture array or 3D texture, this will always access the first layer.
+// This generic overload ignores both arguments and always returns 0; layered
+// sampler types are expected to be handled by more specific overloads.
+template <typename S>
+static ALWAYS_INLINE int swgl_textureLayerOffset(S s, float layer) {
+ return 0;
+}
+
+// Layer offset for 2D array textures: the layer is truncated to an integer,
+// clamped against the sampler's depth, and scaled by the sampler's
+// height_stride.
+UNUSED static ALWAYS_INLINE int swgl_textureLayerOffset(sampler2DArray s,
+                                                        float layer) {
+  auto clampedLayer = clampCoord(int(layer), s->depth);
+  return clampedLayer * s->height_stride;
+}
+
+// Use the default linear quantization scale of 128. This gives 7 bits of
+// fractional precision, which when multiplied with a signed 9 bit value
+// still fits in a 16 bit integer.
+const int swgl_LinearQuantizeScale = 128;
+
+// Quantizes UVs for access into a linear texture. Forwards to linearQuantize
+// with the default scale swgl_LinearQuantizeScale and the sampler s.
+template <typename S, typename T>
+static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) {
+ return linearQuantize(p, swgl_LinearQuantizeScale, s);
+}
+
+// Quantizes an interpolation step for UVs for access into a linear texture:
+// the step is scaled by the sampler via samplerScale and then multiplied by
+// swgl_LinearQuantizeScale.
+template <typename S, typename T>
+static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) {
+  auto scaledStep = samplerScale(s, p);
+  return scaledStep * swgl_LinearQuantizeScale;
+}
+
+// Commit a single chunk from a linear texture fetch. The coordinate p is
+// converted to ivec2 and passed, along with any trailing arguments, to
+// textureLinearUnpacked<format>; the result is committed via swgl_commitChunk.
+// As written, at least one trailing argument must be supplied, since
+// __VA_ARGS__ follows a comma in the expansion.
+#define swgl_commitTextureLinear(format, s, p, ...) \
+ swgl_commitChunk(format, \
+ textureLinearUnpacked##format(s, ivec2(p), __VA_ARGS__))
+#define swgl_commitTextureLinearRGBA8(s, p, ...) \
+ swgl_commitTextureLinear(RGBA8, s, p, __VA_ARGS__)
+#define swgl_commitTextureLinearR8(s, p, ...) \
+ swgl_commitTextureLinear(R8, s, p, __VA_ARGS__)
+
+// Commit a single chunk from a linear texture fetch that is scaled by a color.
+// The fetched chunk is multiplied with pack_pixels_<format>(color) using
+// muldiv255 before being committed via swgl_commitChunk.
+#define swgl_commitTextureLinearColor(format, s, p, color, ...) \
+ swgl_commitChunk(format, muldiv255(textureLinearUnpacked##format( \
+ s, ivec2(p), __VA_ARGS__), \
+ pack_pixels_##format(color)))
+#define swgl_commitTextureLinearColorRGBA8(s, p, color, ...) \
+ swgl_commitTextureLinearColor(RGBA8, s, p, color, __VA_ARGS__)
+#define swgl_commitTextureLinearColorR8(s, p, color, ...) \
+ swgl_commitTextureLinearColor(R8, s, p, color, __VA_ARGS__)
+
+// Commit an entire span of a separable pass of a Gaussian blur that falls
+// within the given radius scaled by supplied coefficients, clamped to uv_rect
+// bounds.
+//
+// p is scaled by the texture size to get the starting texel, and uv_rect is
+// scaled the same way to get integer sampling bounds. `hori` selects the
+// horizontal or vertical blur helper. Only whole chunks of swgl_StepSize
+// pixels that end at or before endX are committed; each swgl_commitChunk
+// advances the output pointer and reduces swgl_SpanLength, so any pixels not
+// committed here remain counted in swgl_SpanLength for the caller to handle.
+//
+// swgl_SpanLength is measured in pixels and the loops advance one texel per
+// pixel, so the span ends at curUV.x + swgl_SpanLength. The length must not
+// be scaled by swgl_StepSize, or the loops could commit chunks beyond the end
+// of the span whenever the uv_rect bound allows it.
+#define swgl_commitGaussianBlur(format, type, s, p, uv_rect, hori, radius, \
+ coeffs, ...) \
+ do { \
+ vec2_scalar size = {float(s->width), float(s->height)}; \
+ ivec2_scalar curUV = make_ivec2(force_scalar(p) * size); \
+ ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size)); \
+ int endX = min(bounds.z, curUV.x + swgl_SpanLength); \
+ if (hori) { \
+ for (; curUV.x + swgl_StepSize <= endX; curUV.x += swgl_StepSize) { \
+ swgl_commitChunk(format, gaussianBlurHorizontal<type>( \
+ s, curUV, bounds.x, bounds.z, radius, \
+ coeffs.x, coeffs.y, __VA_ARGS__)); \
+ } \
+ } else { \
+ for (; curUV.x + swgl_StepSize <= endX; curUV.x += swgl_StepSize) { \
+ swgl_commitChunk(format, gaussianBlurVertical<type>( \
+ s, curUV, bounds.y, bounds.w, radius, \
+ coeffs.x, coeffs.y, __VA_ARGS__)); \
+ } \
+ } \
+ } while (0)
+#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs, ...) \
+ swgl_commitGaussianBlur(RGBA8, uint32_t, s, p, uv_rect, hori, radius, \
+ coeffs, __VA_ARGS__)
+#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs, ...) \
+ swgl_commitGaussianBlur(R8, uint8_t, s, p, uv_rect, hori, radius, coeffs, \
+ __VA_ARGS__)
+
+// Convert and pack planar YUV samples to RGB output using a color space.
+// The Y samples are zipped with themselves and the U samples are zipped with
+// the V samples before being handed to the YUVConverter specialization that
+// matches colorSpace. Any colorSpace other than REC_601, REC_709 or REC_2020
+// falls back to the IDENTITY converter.
+static ALWAYS_INLINE PackedRGBA8 convertYUV(int colorSpace, U16 y, U16 u,
+ U16 v) {
+ auto yy = V8<int16_t>(zip(y, y));
+ auto uv = V8<int16_t>(zip(u, v));
+ switch (colorSpace) {
+ case REC_601:
+ return YUVConverter<REC_601>::convert(yy, uv);
+ case REC_709:
+ return YUVConverter<REC_709>::convert(yy, uv);
+ case REC_2020:
+ return YUVConverter<REC_2020>::convert(yy, uv);
+ default:
+ return YUVConverter<IDENTITY>::convert(yy, uv);
+ }
+}
+
+// Helper functions to sample from planar YUV textures before converting to RGB
+//
+// Single-texture variant: all of Y, U and V come from sampler0, which must
+// have format RGBA8 or YUV422. Any other format asserts in debug builds and
+// returns a zero-filled chunk otherwise. rescaleFactor is accepted for
+// signature parity with the other variants but is not used here.
+template <typename S0>
+static inline PackedRGBA8 sampleYUV(S0 sampler0, vec2 uv0, int layer0,
+ int colorSpace, int rescaleFactor) {
+ ivec2 i0(uv0);
+ switch (sampler0->format) {
+ case TextureFormat::RGBA8: {
+ auto planar = textureLinearPlanarRGBA8(sampler0, i0, layer0);
+ // Y is taken from the high half of the RG plane, U from its low half,
+ // and V from the low half of the BA plane.
+ return convertYUV(colorSpace, highHalf(planar.rg), lowHalf(planar.rg),
+ lowHalf(planar.ba));
+ }
+ case TextureFormat::YUV422: {
+ auto planar = textureLinearPlanarYUV422(sampler0, i0, layer0);
+ return convertYUV(colorSpace, planar.y, planar.u, planar.v);
+ }
+ default:
+ assert(false);
+ return PackedRGBA8(0);
+ }
+}
+
+// Samples a single-texture YUV surface with sampleYUV and scales the result
+// by the packed color using muldiv255.
+template <typename S0, typename C>
+static inline WideRGBA8 sampleColorYUV(S0 sampler0, vec2 uv0, int layer0,
+                                       int colorSpace, int rescaleFactor,
+                                       C color) {
+  PackedRGBA8 sampled =
+      sampleYUV(sampler0, uv0, layer0, colorSpace, rescaleFactor);
+  return muldiv255(unpack(sampled), pack_pixels_RGBA8(color));
+}
+
+// Two-texture variant: Y comes from sampler0 (asserted to be R8) and the
+// chroma samples come from sampler1, which must have format RG8 or RGBA8.
+// Any other sampler1 format asserts in debug builds and returns a
+// zero-filled chunk otherwise. rescaleFactor is not used here.
+template <typename S0, typename S1>
+static inline PackedRGBA8 sampleYUV(S0 sampler0, vec2 uv0, int layer0,
+ S1 sampler1, vec2 uv1, int layer1,
+ int colorSpace, int rescaleFactor) {
+ ivec2 i0(uv0);
+ ivec2 i1(uv1);
+ switch (sampler1->format) {
+ case TextureFormat::RG8: {
+ assert(sampler0->format == TextureFormat::R8);
+ auto y = textureLinearUnpackedR8(sampler0, i0, layer0);
+ auto planar = textureLinearPlanarRG8(sampler1, i1, layer1);
+ // U is the low half and V the high half of the RG plane.
+ return convertYUV(colorSpace, y, lowHalf(planar.rg), highHalf(planar.rg));
+ }
+ case TextureFormat::RGBA8: {
+ assert(sampler0->format == TextureFormat::R8);
+ auto y = textureLinearUnpackedR8(sampler0, i0, layer0);
+ auto planar = textureLinearPlanarRGBA8(sampler1, i1, layer1);
+ // U is the low half of the BA plane and V the high half of the RG plane.
+ return convertYUV(colorSpace, y, lowHalf(planar.ba), highHalf(planar.rg));
+ }
+ default:
+ assert(false);
+ return PackedRGBA8(0);
+ }
+}
+
+// Samples a two-texture YUV surface with sampleYUV and scales the result by
+// the packed color using muldiv255.
+template <typename S0, typename S1, typename C>
+static inline WideRGBA8 sampleColorYUV(S0 sampler0, vec2 uv0, int layer0,
+                                       S1 sampler1, vec2 uv1, int layer1,
+                                       int colorSpace, int rescaleFactor,
+                                       C color) {
+  PackedRGBA8 sampled = sampleYUV(sampler0, uv0, layer0, sampler1, uv1, layer1,
+                                  colorSpace, rescaleFactor);
+  return muldiv255(unpack(sampled), pack_pixels_RGBA8(color));
+}
+
+// Three-texture variant: Y, U and V each come from their own sampler. All
+// three samplers are asserted to share one format, which must be R8 or R16.
+// Any other format asserts in debug builds and returns a zero-filled chunk
+// otherwise. rescaleFactor is only consulted for R16.
+template <typename S0, typename S1, typename S2>
+static inline PackedRGBA8 sampleYUV(S0 sampler0, vec2 uv0, int layer0,
+ S1 sampler1, vec2 uv1, int layer1,
+ S2 sampler2, vec2 uv2, int layer2,
+ int colorSpace, int rescaleFactor) {
+ ivec2 i0(uv0);
+ ivec2 i1(uv1);
+ ivec2 i2(uv2);
+ assert(sampler0->format == sampler1->format &&
+ sampler0->format == sampler2->format);
+ switch (sampler0->format) {
+ case TextureFormat::R8: {
+ auto y = textureLinearUnpackedR8(sampler0, i0, layer0);
+ auto u = textureLinearUnpackedR8(sampler1, i1, layer1);
+ auto v = textureLinearUnpackedR8(sampler2, i2, layer2);
+ return convertYUV(colorSpace, y, u, v);
+ }
+ case TextureFormat::R16: {
+ // The rescaling factor represents how many bits to add to renormalize the
+ // texture to 16 bits, and so the color depth is actually 16 minus the
+ // rescaling factor.
+ // Need to right shift the sample by the amount of bits over 8 it
+ // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
+ // of precision at the low end already, hence 1 is subtracted from the
+ // color depth.
+ int colorDepth = 16 - rescaleFactor;
+ int rescaleBits = (colorDepth - 1) - 8;
+ auto y = textureLinearUnpackedR16(sampler0, i0, layer0) >> rescaleBits;
+ auto u = textureLinearUnpackedR16(sampler1, i1, layer1) >> rescaleBits;
+ auto v = textureLinearUnpackedR16(sampler2, i2, layer2) >> rescaleBits;
+ return convertYUV(colorSpace, U16(y), U16(u), U16(v));
+ }
+ default:
+ assert(false);
+ return PackedRGBA8(0);
+ }
+}
+
+// Samples a three-texture YUV surface with sampleYUV and scales the result by
+// the packed color using muldiv255.
+template <typename S0, typename S1, typename S2, typename C>
+static inline WideRGBA8 sampleColorYUV(S0 sampler0, vec2 uv0, int layer0,
+                                       S1 sampler1, vec2 uv1, int layer1,
+                                       S2 sampler2, vec2 uv2, int layer2,
+                                       int colorSpace, int rescaleFactor,
+                                       C color) {
+  PackedRGBA8 sampled =
+      sampleYUV(sampler0, uv0, layer0, sampler1, uv1, layer1, sampler2, uv2,
+                layer2, colorSpace, rescaleFactor);
+  return muldiv255(unpack(sampled), pack_pixels_RGBA8(color));
+}
+
+// Commit a single chunk of a YUV surface represented by multiple planar
+// textures. This requires a color space specifier selecting how to convert
+// from YUV to RGB output. In the case of HDR formats, a rescaling factor
+// selects how many bits of precision must be utilized on conversion. See the
+// sampleYUV dispatcher functions for the various supported plane
+// configurations this intrinsic accepts.
+// All arguments are forwarded unchanged to sampleYUV, so overload resolution
+// on the argument list picks the plane configuration. Output is always RGBA8.
+#define swgl_commitTextureLinearYUV(...) \
+ swgl_commitChunk(RGBA8, sampleYUV(__VA_ARGS__))
+// Commit a single chunk of a YUV surface scaled by a color.
+// All arguments are forwarded unchanged to sampleColorYUV, whose final
+// parameter is the color.
+#define swgl_commitTextureLinearColorYUV(...) \
+ swgl_commitChunk(RGBA8, sampleColorYUV(__VA_ARGS__))
+
+// Helpers that optionally modulate a source chunk by a color.
+// NoColor is an empty tag type: overloads taking it return the source
+// unchanged, while overloads taking a WideRGBA8 color multiply the source by
+// that color with muldiv255.
+struct NoColor {};
+
+// Unpacked source, no color: pass through.
+SI WideRGBA8 applyColor(WideRGBA8 src, NoColor) { return src; }
+
+// Unpacked source scaled by color.
+SI WideRGBA8 applyColor(WideRGBA8 src, WideRGBA8 color) {
+  WideRGBA8 scaled = muldiv255(src, color);
+  return scaled;
+}
+
+// Packed source, no color: pass through.
+SI PackedRGBA8 applyColor(PackedRGBA8 src, NoColor) { return src; }
+
+// Packed source scaled by color: unpack, scale, then repack.
+SI PackedRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) {
+  auto wideSrc = unpack(src);
+  return pack(muldiv255(wideSrc, color));
+}
+
+// Samples an axis-aligned span on a single row of a texture using 1:1
+// nearest filtering. Sampling is constrained to only fall within the given UV
+// bounds. This requires a pointer to the destination buffer. An optional color
+// modulus can be supplied.
+//
+// The span is processed in up to three sections: pixels whose source X lies
+// below the clamped minimum are filled with the sample at minX, pixels inside
+// [minX, maxX] are copied (or blended, when blend_key is set) texel for texel,
+// and pixels beyond maxX are filled with the sample at maxX.
+//
+// sampler:     texture to read from (buf, stride, width and height are used).
+// i:           starting texel coordinate of the span.
+// span:        number of destination pixels to write.
+// minUV/maxUV: inclusive texel bounds that sampling is clamped to.
+// color:       NoColor, or a WideRGBA8 each sample is multiplied by.
+// buf:         destination pixels.
+// layerOffset: offset added to the row index into sampler->buf.
+template <typename S, typename C>
+static void blendTextureNearestRGBA8(S sampler, const ivec2_scalar& i, int span,
+ const ivec2_scalar& minUV,
+ const ivec2_scalar& maxUV, C color,
+ uint32_t* buf, int layerOffset = 0) {
+ // Calculate the row pointer within the buffer, clamping to within valid row
+ // bounds.
+ uint32_t* row =
+ &sampler->buf[clamp(clampCoord(i.y, sampler->height), minUV.y, maxUV.y) *
+ sampler->stride +
+ layerOffset];
+ // Find clamped X bounds within the row.
+ int minX = clamp(minUV.x, 0, sampler->width - 1);
+ int maxX = clamp(maxUV.x, minX, sampler->width - 1);
+ int curX = i.x;
+ // If we need to start sampling below the valid sample bounds, then we need to
+ // fill this section with a constant clamped sample.
+ if (curX < minX) {
+ int n = min(minX - curX, span);
+ auto src = applyColor(unpack(bit_cast<PackedRGBA8>(U32(row[minX]))), color);
+ commit_solid_span(buf, src, n);
+ buf += n;
+ span -= n;
+ curX += n;
+ }
+ // Here we only deal with valid samples within the sample bounds. No clamping
+ // should occur here within these inner loops.
+ int n = clamp(maxX + 1 - curX, 0, span);
+ span -= n;
+ // Try to process as many chunks as possible with full loads and stores.
+ if (blend_key) {
+ for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) {
+ auto src =
+ applyColor(unpack(unaligned_load<PackedRGBA8>(&row[curX])), color);
+ auto r = blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), src);
+ unaligned_store(buf, pack(r));
+ }
+ } else {
+ for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) {
+ auto src = applyColor(unaligned_load<PackedRGBA8>(&row[curX]), color);
+ unaligned_store(buf, src);
+ }
+ }
+ // Number of in-bounds pixels left over after the full 4-pixel chunks.
+ n &= 3;
+ // If we have any leftover samples after processing chunks, use partial loads
+ // and stores.
+ if (n > 0) {
+ if (blend_key) {
+ auto src = applyColor(
+ unpack(partial_load_span<PackedRGBA8>(&row[curX], n)), color);
+ auto r =
+ blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, n), src, n);
+ partial_store_span(buf, pack(r), n);
+ } else {
+ auto src =
+ applyColor(partial_load_span<PackedRGBA8>(&row[curX], n), color);
+ partial_store_span(buf, src, n);
+ }
+ buf += n;
+ curX += n;
+ }
+ // If we still have samples left above the valid sample bounds, then we again
+ // need to fill this section with a constant clamped sample.
+ if (span > 0) {
+ auto src = applyColor(unpack(bit_cast<PackedRGBA8>(U32(row[maxX]))), color);
+ commit_solid_span(buf, src, span);
+ }
+}
+
+// TODO: blendTextureNearestR8 if it is actually needed
+
+// Commit an entire span of 1:1 nearest texture fetches, potentially scaled by a
+// color.
+// p and the corners of uv_rect are scaled to texel coordinates with
+// samplerScale, the whole remaining span is written by
+// blendTextureNearest<format>, and then the output pointer is advanced past
+// the span and swgl_SpanLength is reset to 0.
+// NOTE(review): only blendTextureNearestRGBA8 is defined in this file (see the
+// TODO above), so the R8 wrappers below will not compile if instantiated.
+#define swgl_commitTextureNearest(format, s, p, uv_rect, color, ...) \
+ do { \
+ ivec2_scalar i = make_ivec2(samplerScale(s, force_scalar(p))); \
+ ivec2_scalar min_uv = \
+ make_ivec2(samplerScale(s, vec2_scalar{uv_rect.x, uv_rect.y})); \
+ ivec2_scalar max_uv = \
+ make_ivec2(samplerScale(s, vec2_scalar{uv_rect.z, uv_rect.w})); \
+ blendTextureNearest##format(s, i, swgl_SpanLength, min_uv, max_uv, color, \
+ swgl_Out##format, __VA_ARGS__); \
+ swgl_Out##format += swgl_SpanLength; \
+ swgl_SpanLength = 0; \
+ } while (0)
+// Unmodulated variants: pass the NoColor tag so samples are copied as-is.
+#define swgl_commitTextureNearestRGBA8(s, p, uv_rect, ...) \
+ swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor(), __VA_ARGS__)
+#define swgl_commitTextureNearestR8(s, p, uv_rect, ...) \
+ swgl_commitTextureNearest(R8, s, p, uv_rect, NoColor(), __VA_ARGS__)
+
+// Color-modulated variants: the color is packed with pack_pixels_<format>
+// before being handed to swgl_commitTextureNearest.
+#define swgl_commitTextureNearestColor(format, s, p, uv_rect, color, ...) \
+ swgl_commitTextureNearest(format, s, p, uv_rect, \
+ pack_pixels_##format(color), __VA_ARGS__)
+#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color, ...) \
+ swgl_commitTextureNearestColor(RGBA8, s, p, uv_rect, color, __VA_ARGS__)
+#define swgl_commitTextureNearestColorR8(s, p, uv_rect, color, ...) \
+ swgl_commitTextureNearestColor(R8, s, p, uv_rect, color, __VA_ARGS__)
+
+// Helper function to decide whether we can safely apply 1:1 nearest filtering
+// without diverging too much from the linear filter.
+// P holds per-lane UV coordinates (P.x.x / P.x.y are the first two lanes of
+// the X coordinate); span is the number of pixels that would be drawn.
+// Returns true only if the row is constant, the X step is close enough to one
+// texel per pixel over the span, and the first sample lies near a texel
+// center.
+template <typename S, typename T>
+static bool allowTextureNearest(S sampler, T P, int span) {
+ // First verify if the row Y doesn't change across samples
+ if (P.y.x != P.y.y) {
+ return false;
+ }
+ P = samplerScale(sampler, P);
+ // We need to verify that the pixel step reasonably approximates stepping
+ // by a single texel for every pixel we need to reproduce. Try to ensure
+ // that the margin of error is no more than approximately 2^-7.
+ // Round span down to a multiple of 128 and then add 128, so the step is
+ // always checked over at least 128 pixels.
+ span &= ~(128 - 1);
+ span += 128;
+ return round((P.x.y - P.x.x) * span) == span &&
+ // Also verify that we're reasonably close to the center of a texel
+ // so that it doesn't look that much different than if a linear filter
+ // was used.
+ // (The coordinate is rounded to quarter texels; a remainder of 2
+ // quarters means the sample sits at the half-texel position.)
+ (int(P.x.x * 4.0f + 0.5f) & 3) == 2 &&
+ (int(P.y.x * 4.0f + 0.5f) & 3) == 2;
+}
+
+// Determine if we can apply 1:1 nearest filtering to a span of texture
+#define swgl_allowTextureNearest(s, p) \
+ allowTextureNearest(s, p, swgl_SpanLength)
+
+// Extension to set a clip mask image to be sampled during blending. The offset
+// specifies the positioning of the clip mask image relative to the viewport
+// origin. The bounding box specifies the rectangle relative to the clip mask's
+// origin that constrains sampling within the clip mask.
+// The three statics below hold the current clip mask state; swgl_ClipMask
+// starts out as nullptr (no mask set).
+static sampler2D swgl_ClipMask = nullptr;
+static IntPoint swgl_ClipMaskOffset = {0, 0};
+static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0};
+// Updates the clip mask state only when bb_size is non-zero; a zero bb_size
+// leaves the previously stored mask, offset and bounds untouched.
+#define swgl_clipMask(mask, offset, bb_origin, bb_size) \
+ do { \
+ if (bb_size != vec2_scalar(0.0f, 0.0f)) { \
+ swgl_ClipMask = mask; \
+ swgl_ClipMaskOffset = make_ivec2(offset); \
+ swgl_ClipMaskBounds = \
+ IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \
+ } \
+ } while (0)
+
+// Dispatch helper used by the GLSL translator to swgl_drawSpan functions.
+// The number of pixels committed is tracked by checking for the difference in
+// swgl_SpanLength. Any varying interpolants used will be advanced past the
+// committed part of the span in case the fragment shader must be executed for
+// any remaining pixels that were not committed by the span shader.
+// The fallback loop runs the regular fragment shader and commits one whole
+// chunk of swgl_StepSize pixels per iteration until swgl_SpanLength is no
+// longer positive, so the final chunk is written in full even when fewer than
+// swgl_StepSize pixels remained.
+#define DISPATCH_DRAW_SPAN(self, format) \
+ do { \
+ int total = self->swgl_SpanLength; \
+ self->swgl_drawSpan##format(); \
+ int drawn = total - self->swgl_SpanLength; \
+ if (drawn) self->step_interp_inputs(drawn); \
+ while (self->swgl_SpanLength > 0) { \
+ run(self); \
+ commit_span(self->swgl_Out##format, pack_span(self->swgl_Out##format)); \
+ self->swgl_Out##format += swgl_StepSize; \
+ self->swgl_SpanLength -= swgl_StepSize; \
+ } \
+ } while (0)
diff --git a/gfx/wr/swgl/src/swgl_fns.rs b/gfx/wr/swgl/src/swgl_fns.rs
new file mode 100644
index 0000000000..21bfc21e84
--- /dev/null
+++ b/gfx/wr/swgl/src/swgl_fns.rs
@@ -0,0 +1,2490 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#![allow(unused_variables)]
+
+use gleam::gl::*;
+use std::ffi::{CStr, CString};
+use std::os::raw::{c_char, c_int, c_void};
+use std::ptr;
+use std::str;
+
+// Tracing shim: accepts any token sequence and expands to nothing, so the
+// `debug!` calls throughout this file (and their arguments) compile away.
+#[allow(unused)]
+macro_rules! debug {
+ ($($x:tt)*) => {};
+}
+
+// Opaque, zero-sized `#[repr(C)]` type. It is never constructed here; it only
+// appears behind the `*mut LockedTexture` pointers exchanged with the
+// extern "C" functions declared in this file.
+#[repr(C)]
+struct LockedTexture { _private: [u8; 0] }
+
+// Foreign entry points with C linkage. This block only declares them; the
+// definitions are not in this file (presumably they live in the crate's C++
+// sources such as gl.cc -- TODO confirm). Every call is `unsafe`: the
+// compiler cannot check that these signatures match the foreign definitions.
+extern "C" {
+ fn ActiveTexture(texture: GLenum);
+ fn BindTexture(target: GLenum, texture: GLuint);
+ fn BindBuffer(target: GLenum, buffer: GLuint);
+ fn BindVertexArray(vao: GLuint);
+ fn BindFramebuffer(target: GLenum, fb: GLuint);
+ fn BindRenderbuffer(target: GLenum, rb: GLuint);
+ fn BlendFunc(srgb: GLenum, drgb: GLenum, sa: GLenum, da: GLenum);
+ fn BlendColor(r: GLfloat, g: GLfloat, b: GLfloat, a: GLfloat);
+ fn BlendEquation(mode: GLenum);
+ fn Enable(cap: GLenum);
+ fn Disable(cap: GLenum);
+ fn GenQueries(n: GLsizei, result: *mut GLuint);
+ fn BeginQuery(target: GLenum, id: GLuint);
+ fn EndQuery(target: GLenum);
+ fn GetQueryObjectui64v(id: GLuint, pname: GLenum, params: *mut GLuint64);
+ fn GenBuffers(n: i32, result: *mut GLuint);
+ fn GenTextures(n: i32, result: *mut GLuint);
+ fn GenFramebuffers(n: i32, result: *mut GLuint);
+ fn GenRenderbuffers(n: i32, result: *mut GLuint);
+ fn BufferData(target: GLenum, size: GLsizeiptr, data: *const GLvoid, usage: GLenum);
+ fn BufferSubData(target: GLenum, offset: GLintptr, size: GLsizeiptr, data: *const GLvoid);
+ fn MapBuffer(target: GLenum, access: GLbitfield) -> *mut c_void;
+ fn MapBufferRange(
+ target: GLenum,
+ offset: GLintptr,
+ length: GLsizeiptr,
+ access: GLbitfield,
+ ) -> *mut c_void;
+ fn UnmapBuffer(target: GLenum) -> GLboolean;
+ fn TexStorage2D(
+ target: GLenum,
+ levels: GLint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ );
+ fn FramebufferTexture2D(
+ target: GLenum,
+ attachment: GLenum,
+ textarget: GLenum,
+ texture: GLuint,
+ level: GLint,
+ );
+ fn CheckFramebufferStatus(target: GLenum) -> GLenum;
+ fn InvalidateFramebuffer(
+ target: GLenum,
+ num_attachments: GLsizei,
+ attachments: *const GLenum,
+ );
+ fn TexStorage3D(
+ target: GLenum,
+ levels: GLint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ );
+ fn TexImage2D(
+ target: GLenum,
+ level: GLint,
+ internal_format: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ border: GLint,
+ format: GLenum,
+ ty: GLenum,
+ data: *const c_void,
+ );
+ fn TexImage3D(
+ target: GLenum,
+ level: GLint,
+ internal_format: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ border: GLint,
+ format: GLenum,
+ ty: GLenum,
+ data: *const c_void,
+ );
+ fn TexSubImage2D(
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ data: *const c_void,
+ );
+ fn TexSubImage3D(
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ zoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ data: *const c_void,
+ );
+ fn GenerateMipmap(target: GLenum);
+ fn GetUniformLocation(program: GLuint, name: *const GLchar) -> GLint;
+ fn BindAttribLocation(program: GLuint, index: GLuint, name: *const GLchar);
+ fn GetAttribLocation(program: GLuint, name: *const GLchar) -> GLint;
+ fn GenVertexArrays(n: i32, result: *mut GLuint);
+ fn VertexAttribPointer(
+ index: GLuint,
+ size: GLint,
+ type_: GLenum,
+ normalized: GLboolean,
+ stride: GLsizei,
+ offset: *const GLvoid,
+ );
+ fn VertexAttribIPointer(
+ index: GLuint,
+ size: GLint,
+ type_: GLenum,
+ stride: GLsizei,
+ offset: *const GLvoid,
+ );
+ fn CreateShader(shader_type: GLenum) -> GLuint;
+ fn AttachShader(program: GLuint, shader: GLuint);
+ fn CreateProgram() -> GLuint;
+ fn Uniform1i(location: GLint, v0: GLint);
+ fn Uniform4fv(location: GLint, count: GLsizei, value: *const GLfloat);
+ fn UniformMatrix4fv(
+ location: GLint,
+ count: GLsizei,
+ transpose: GLboolean,
+ value: *const GLfloat,
+ );
+ fn DrawElementsInstanced(
+ mode: GLenum,
+ count: GLsizei,
+ type_: GLenum,
+ indices: GLintptr,
+ instancecount: GLsizei,
+ );
+ fn EnableVertexAttribArray(index: GLuint);
+ fn VertexAttribDivisor(index: GLuint, divisor: GLuint);
+ fn LinkProgram(program: GLuint);
+ fn GetLinkStatus(program: GLuint) -> GLint;
+ fn UseProgram(program: GLuint);
+ fn SetViewport(x: GLint, y: GLint, width: GLsizei, height: GLsizei);
+ fn FramebufferTextureLayer(
+ target: GLenum,
+ attachment: GLenum,
+ texture: GLuint,
+ level: GLint,
+ layer: GLint,
+ );
+ fn FramebufferRenderbuffer(
+ target: GLenum,
+ attachment: GLenum,
+ renderbuffertarget: GLenum,
+ renderbuffer: GLuint,
+ );
+ fn RenderbufferStorage(target: GLenum, internalformat: GLenum, width: GLsizei, height: GLsizei);
+ fn DepthMask(flag: GLboolean);
+ fn DepthFunc(func: GLenum);
+ fn SetScissor(x: GLint, y: GLint, width: GLsizei, height: GLsizei);
+ fn ClearColor(r: GLfloat, g: GLfloat, b: GLfloat, a: GLfloat);
+ fn ClearDepth(depth: GLdouble);
+ fn Clear(mask: GLbitfield);
+ fn PixelStorei(name: GLenum, param: GLint);
+ fn ReadPixels(
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ data: *mut c_void,
+ );
+ fn Finish();
+ fn ShaderSourceByName(shader: GLuint, name: *const GLchar);
+ fn TexParameteri(target: GLenum, pname: GLenum, param: GLint);
+ fn CopyImageSubData(
+ src_name: GLuint,
+ src_target: GLenum,
+ src_level: GLint,
+ src_x: GLint,
+ src_y: GLint,
+ src_z: GLint,
+ dst_name: GLuint,
+ dst_target: GLenum,
+ dst_level: GLint,
+ dst_x: GLint,
+ dst_y: GLint,
+ dst_z: GLint,
+ src_width: GLsizei,
+ src_height: GLsizei,
+ src_depth: GLsizei,
+ );
+ fn CopyTexSubImage2D(
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ );
+ fn CopyTexSubImage3D(
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ zoffset: GLint,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ );
+ fn BlitFramebuffer(
+ src_x0: GLint,
+ src_y0: GLint,
+ src_x1: GLint,
+ src_y1: GLint,
+ dst_x0: GLint,
+ dst_y0: GLint,
+ dst_x1: GLint,
+ dst_y1: GLint,
+ mask: GLbitfield,
+ filter: GLenum,
+ );
+ fn GetIntegerv(pname: GLenum, params: *mut GLint);
+ fn GetBooleanv(pname: GLenum, params: *mut GLboolean);
+ fn GetString(name: GLenum) -> *const c_char;
+ fn GetStringi(name: GLenum, index: GLuint) -> *const c_char;
+ fn GetError() -> GLenum;
+ fn InitDefaultFramebuffer(
+ x: i32,
+ y: i32,
+ width: i32,
+ height: i32,
+ stride: i32,
+ buf: *mut c_void,
+ );
+ fn GetColorBuffer(
+ fbo: GLuint,
+ flush: GLboolean,
+ width: *mut i32,
+ height: *mut i32,
+ stride: *mut i32,
+ ) -> *mut c_void;
+ fn SetTextureBuffer(
+ tex: GLuint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ stride: GLsizei,
+ buf: *mut c_void,
+ min_width: GLsizei,
+ min_height: GLsizei,
+ );
+ fn SetTextureParameter(tex: GLuint, pname: GLenum, param: GLint);
+ fn DeleteTexture(n: GLuint);
+ fn DeleteRenderbuffer(n: GLuint);
+ fn DeleteFramebuffer(n: GLuint);
+ fn DeleteBuffer(n: GLuint);
+ fn DeleteVertexArray(n: GLuint);
+ fn DeleteQuery(n: GLuint);
+ fn DeleteShader(shader: GLuint);
+ fn DeleteProgram(program: GLuint);
+ fn LockFramebuffer(fbo: GLuint) -> *mut LockedTexture;
+ fn LockTexture(tex: GLuint) -> *mut LockedTexture;
+ fn LockResource(resource: *mut LockedTexture);
+ fn UnlockResource(resource: *mut LockedTexture);
+ fn GetResourceBuffer(
+ resource: *mut LockedTexture,
+ width: *mut i32,
+ height: *mut i32,
+ stride: *mut i32,
+ ) -> *mut c_void;
+ fn Composite(
+ locked_dst: *mut LockedTexture,
+ locked_src: *mut LockedTexture,
+ src_x: GLint,
+ src_y: GLint,
+ src_width: GLsizei,
+ src_height: GLsizei,
+ dst_x: GLint,
+ dst_y: GLint,
+ dst_width: GLsizei,
+ dst_height: GLsizei,
+ opaque: GLboolean,
+ flip: GLboolean,
+ filter: GLenum,
+ clip_x: GLint,
+ clip_y: GLint,
+ clip_width: GLsizei,
+ clip_height: GLsizei,
+ );
+ fn CompositeYUV(
+ locked_dst: *mut LockedTexture,
+ locked_y: *mut LockedTexture,
+ locked_u: *mut LockedTexture,
+ locked_v: *mut LockedTexture,
+ color_space: YUVColorSpace,
+ color_depth: GLuint,
+ src_x: GLint,
+ src_y: GLint,
+ src_width: GLsizei,
+ src_height: GLsizei,
+ dst_x: GLint,
+ dst_y: GLint,
+ dst_width: GLsizei,
+ dst_height: GLsizei,
+ flip: GLboolean,
+ clip_x: GLint,
+ clip_y: GLint,
+ clip_width: GLsizei,
+ clip_height: GLsizei,
+ );
+ fn CreateContext() -> *mut c_void;
+ fn ReferenceContext(ctx: *mut c_void);
+ fn DestroyContext(ctx: *mut c_void);
+ fn MakeCurrent(ctx: *mut c_void);
+}
+
+/// Handle wrapping a raw context pointer, as returned by `CreateContext` or
+/// supplied through `From<*mut c_void>`. The type is `Copy`: copying the
+/// handle copies only the pointer value, so several handles can refer to the
+/// same underlying context.
+#[derive(Clone, Copy)]
+pub struct Context(*mut c_void);
+
+impl Context {
+ /// Calls `CreateContext` and wraps the pointer it returns.
+ pub fn create() -> Self {
+ let raw = unsafe { CreateContext() };
+ Context(raw)
+ }
+
+ /// Hands this context's pointer to `ReferenceContext`.
+ pub fn reference(&self) {
+ unsafe { ReferenceContext(self.0) }
+ }
+
+ /// Hands this context's pointer to `DestroyContext`.
+ pub fn destroy(&self) {
+ unsafe { DestroyContext(self.0) }
+ }
+
+ /// Hands this context's pointer to `MakeCurrent`.
+ pub fn make_current(&self) {
+ unsafe { MakeCurrent(self.0) }
+ }
+
+ /// Forwards the rectangle, stride and buffer pointer to
+ /// `InitDefaultFramebuffer`. The context pointer itself is not passed.
+ pub fn init_default_framebuffer(
+ &self,
+ x: i32,
+ y: i32,
+ width: i32,
+ height: i32,
+ stride: i32,
+ buf: *mut c_void,
+ ) {
+ unsafe { InitDefaultFramebuffer(x, y, width, height, stride, buf) }
+ }
+
+ /// Queries `GetColorBuffer` for `fbo` and returns
+ /// `(data pointer, width, height, stride)`. The three integers start at 0
+ /// and are filled in through out-pointers by the native call.
+ pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32, i32) {
+ let (mut w, mut h, mut pitch) = (0i32, 0i32, 0i32);
+ let buf = unsafe { GetColorBuffer(fbo, flush as GLboolean, &mut w, &mut h, &mut pitch) };
+ (buf, w, h, pitch)
+ }
+
+ /// Forwards every argument unchanged to `SetTextureBuffer`.
+ pub fn set_texture_buffer(
+ &self,
+ tex: GLuint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ stride: GLsizei,
+ buf: *mut c_void,
+ min_width: GLsizei,
+ min_height: GLsizei,
+ ) {
+ unsafe {
+ SetTextureBuffer(tex, internal_format, width, height, stride, buf, min_width, min_height)
+ }
+ }
+
+ /// Forwards every argument unchanged to `SetTextureParameter`.
+ pub fn set_texture_parameter(&self, tex: GLuint, pname: GLenum, param: GLint) {
+ unsafe { SetTextureParameter(tex, pname, param) }
+ }
+
+ /// Calls `LockFramebuffer(fbo)`; a null result maps to `None`, anything
+ /// else is wrapped in a `LockedResource`.
+ pub fn lock_framebuffer(&self, fbo: GLuint) -> Option<LockedResource> {
+ let locked = unsafe { LockFramebuffer(fbo) };
+ if locked.is_null() {
+ None
+ } else {
+ Some(LockedResource(locked))
+ }
+ }
+
+ /// Calls `LockTexture(tex)`; a null result maps to `None`, anything else
+ /// is wrapped in a `LockedResource`.
+ pub fn lock_texture(&self, tex: GLuint) -> Option<LockedResource> {
+ let locked = unsafe { LockTexture(tex) };
+ if locked.is_null() {
+ None
+ } else {
+ Some(LockedResource(locked))
+ }
+ }
+}
+
+/// Wraps an existing raw context pointer without any validation.
+impl From<*mut c_void> for Context {
+ fn from(raw: *mut c_void) -> Context {
+ Context(raw)
+ }
+}
+
+/// Unwraps a `Context` back into the raw pointer it carries.
+impl From<Context> for *mut c_void {
+ fn from(ctx: Context) -> *mut c_void {
+ let Context(raw) = ctx;
+ raw
+ }
+}
+
+/// Computes the byte length of a tightly packed pixel rectangle:
+/// `width * height * components(format) * bytes_per_component(pixel_type)`.
+///
+/// `UNSIGNED_INT_8_8_8_8_REV` is counted as one byte per component.
+///
+/// # Panics
+/// Panics on an unsupported `format` or `pixel_type`, on a negative `width`
+/// or `height`, and when the product does not fit in `usize`. The arithmetic
+/// is done in `usize` with overflow checks; multiplying in `GLsizei` could
+/// overflow, and two negative dimensions would yield a plausible positive
+/// length.
+fn calculate_length(width: GLsizei, height: GLsizei, format: GLenum, pixel_type: GLenum) -> usize {
+ let colors: usize = match format {
+ RED => 1,
+ RGB => 3,
+ BGR => 3,
+
+ RGBA => 4,
+ BGRA => 4,
+
+ ALPHA => 1,
+ R16 => 1,
+ LUMINANCE => 1,
+ DEPTH_COMPONENT => 1,
+ _ => panic!("unsupported format for read_pixels: {:?}", format),
+ };
+ let depth: usize = match pixel_type {
+ UNSIGNED_BYTE => 1,
+ UNSIGNED_SHORT => 2,
+ SHORT => 2,
+ FLOAT => 4,
+ UNSIGNED_INT_8_8_8_8_REV => 1,
+ _ => panic!("unsupported pixel_type for read_pixels: {:?}", pixel_type),
+ };
+
+ assert!(
+ width >= 0 && height >= 0,
+ "negative dimensions for read_pixels: {}x{}",
+ width,
+ height
+ );
+ // Both values are non-negative here, so the casts are lossless.
+ (width as usize)
+ .checked_mul(height as usize)
+ .and_then(|pixels| pixels.checked_mul(colors))
+ .and_then(|components| components.checked_mul(depth))
+ .expect("read_pixels buffer length overflows usize")
+}
+
+impl Gl for Context {
+ /// Always reports `GlType::Gl`.
+ fn get_type(&self) -> GlType {
+ GlType::Gl
+ }
+
+ /// Forwards to the native `BufferData` entry point, passing every
+ /// argument through unchanged.
+ fn buffer_data_untyped(
+ &self,
+ target: GLenum,
+ size: GLsizeiptr,
+ data: *const GLvoid,
+ usage: GLenum,
+ ) {
+ debug!(
+ "buffer_data_untyped {} {} {:?} {}",
+ target, size, data, usage
+ );
+ unsafe { BufferData(target, size, data, usage) }
+ }
+
+ /// Forwards to the native `BufferSubData` entry point, passing every
+ /// argument through unchanged.
+ fn buffer_sub_data_untyped(
+ &self,
+ target: GLenum,
+ offset: isize,
+ size: GLsizeiptr,
+ data: *const GLvoid,
+ ) {
+ debug!(
+ "buffer_sub_data_untyped {} {} {} {:?}",
+ target, offset, size, data
+ );
+ unsafe { BufferSubData(target, offset, size, data) }
+ }
+
+ /// Returns whatever pointer the native `MapBuffer` yields for `target`
+ /// and `access`; the result is not checked for null here.
+ fn map_buffer(&self, target: GLenum, access: GLbitfield) -> *mut c_void {
+ unsafe { MapBuffer(target, access) }
+ }
+
+ /// Returns whatever pointer the native `MapBufferRange` yields; all
+ /// arguments are forwarded unchanged and the result is not checked here.
+ fn map_buffer_range(
+ &self,
+ target: GLenum,
+ offset: GLintptr,
+ length: GLsizeiptr,
+ access: GLbitfield,
+ ) -> *mut c_void {
+ unsafe { MapBufferRange(target, offset, length, access) }
+ }
+
+ /// Returns the `GLboolean` produced by the native `UnmapBuffer(target)`.
+ fn unmap_buffer(&self, target: GLenum) -> GLboolean {
+ unsafe { UnmapBuffer(target) }
+ }
+
+ /// Selects the shader by the name embedded in its source text.
+ ///
+ /// Each string is searched for a `// shader: <name>` marker followed by a
+ /// newline. The first name found is trimmed and passed to
+ /// `ShaderSourceByName`, after which the function returns.
+ ///
+ /// Panics with "unknown shader" if no string contains such a marker, and
+ /// also if a string is not valid UTF-8 or the name contains a NUL byte.
+ fn shader_source(&self, shader: GLuint, strings: &[&[u8]]) {
+ debug!("shader_source {}", shader);
+ const PREFIX: &'static str = "// shader: ";
+ for source in strings {
+ let text = str::from_utf8(source).unwrap();
+ let marker = match text.find(PREFIX) {
+ Some(pos) => pos,
+ None => continue,
+ };
+ // The newline offset is relative to `marker`, so the name ends at
+ // `marker + eol`.
+ let eol = match text[marker ..].find('\n') {
+ Some(len) => len,
+ None => continue,
+ };
+ let name = text[marker + PREFIX.len() .. marker + eol].trim();
+ debug!("shader name: {}", name);
+ let c_name = CString::new(name).unwrap();
+ unsafe {
+ ShaderSourceByName(shader, c_name.as_ptr());
+ }
+ return;
+ }
+ panic!("unknown shader");
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn tex_buffer(&self, target: GLenum, internal_format: GLenum, buffer: GLuint) {
+ panic!();
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn read_buffer(&self, mode: GLenum) {
+ panic!();
+ }
+
+ /// Reads a pixel rectangle into `dst_buffer` through the native
+ /// `ReadPixels`.
+ ///
+ /// Panics unless `dst_buffer.len()` equals the length `calculate_length`
+ /// computes for the given dimensions, format and pixel type.
+ fn read_pixels_into_buffer(
+ &self,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ pixel_type: GLenum,
+ dst_buffer: &mut [u8],
+ ) {
+ // The caller is expected to have sized dst_buffer exactly; check that
+ // before handing the raw pointer to native code.
+ assert!(calculate_length(width, height, format, pixel_type) == dst_buffer.len());
+
+ let dst = dst_buffer.as_mut_ptr() as *mut c_void;
+ unsafe { ReadPixels(x, y, width, height, format, pixel_type, dst) }
+ }
+
+ /// Reads back a pixel rectangle and returns it as a newly allocated byte
+ /// vector whose length is given by `calculate_length`.
+ ///
+ /// Panics under the same conditions as `calculate_length` and
+ /// `read_pixels_into_buffer`.
+ fn read_pixels(
+ &self,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ pixel_type: GLenum,
+ ) -> Vec<u8> {
+ let len = calculate_length(width, height, format, pixel_type);
+ // Zero-fill instead of `reserve` + `set_len`: `set_len` requires the
+ // elements to be initialised, and forming a `&mut [u8]` over
+ // uninitialised capacity is undefined behaviour.
+ let mut pixels = vec![0u8; len];
+
+ self.read_pixels_into_buffer(
+ x,
+ y,
+ width,
+ height,
+ format,
+ pixel_type,
+ pixels.as_mut_slice(),
+ );
+
+ pixels
+ }
+
+ /// Calls the native `ReadPixels` with a null destination pointer; all
+ /// other arguments are forwarded unchanged.
+ unsafe fn read_pixels_into_pbo(
+ &self,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ pixel_type: GLenum,
+ ) {
+ let no_client_buffer: *mut c_void = ptr::null_mut();
+ ReadPixels(x, y, width, height, format, pixel_type, no_client_buffer);
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn sample_coverage(&self, value: GLclampf, invert: bool) {
+ panic!();
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn polygon_offset(&self, factor: GLfloat, units: GLfloat) {
+ panic!();
+ }
+
+ /// Forwards `name` and `param` unchanged to the native `PixelStorei`.
+ fn pixel_store_i(&self, name: GLenum, param: GLint) {
+ debug!("pixel_store_i {:x} {}", name, param);
+ unsafe { PixelStorei(name, param) }
+ }
+
+ /// Allocates a zeroed vector of `n` names and lets the native
+ /// `GenBuffers` fill it in.
+ fn gen_buffers(&self, n: GLsizei) -> Vec<GLuint> {
+ let mut names: Vec<GLuint> = vec![0; n as usize];
+ unsafe { GenBuffers(n, names.as_mut_ptr()) };
+ names
+ }
+
+ /// Allocates a zeroed vector of `n` names and lets the native
+ /// `GenRenderbuffers` fill it in.
+ fn gen_renderbuffers(&self, n: GLsizei) -> Vec<GLuint> {
+ debug!("gen_renderbuffers {}", n);
+ let mut names: Vec<GLuint> = vec![0; n as usize];
+ unsafe { GenRenderbuffers(n, names.as_mut_ptr()) };
+ names
+ }
+
+ /// Allocates a zeroed vector of `n` names and lets the native
+ /// `GenFramebuffers` fill it in.
+ fn gen_framebuffers(&self, n: GLsizei) -> Vec<GLuint> {
+ debug!("gen_framebuffers {}", n);
+ let mut names: Vec<GLuint> = vec![0; n as usize];
+ unsafe { GenFramebuffers(n, names.as_mut_ptr()) };
+ names
+ }
+
+ /// Allocates a zeroed vector of `n` names and lets the native
+ /// `GenTextures` fill it in.
+ fn gen_textures(&self, n: GLsizei) -> Vec<GLuint> {
+ let mut names: Vec<GLuint> = vec![0; n as usize];
+ unsafe { GenTextures(n, names.as_mut_ptr()) };
+ names
+ }
+
+ /// Allocates a zeroed vector of `n` names and lets the native
+ /// `GenVertexArrays` fill it in.
+ fn gen_vertex_arrays(&self, n: GLsizei) -> Vec<GLuint> {
+ let mut names: Vec<GLuint> = vec![0; n as usize];
+ unsafe { GenVertexArrays(n, names.as_mut_ptr()) };
+ names
+ }
+
+ /// Alias for `gen_vertex_arrays`; the two behave identically.
+ fn gen_vertex_arrays_apple(&self, n: GLsizei) -> Vec<GLuint> {
+ Gl::gen_vertex_arrays(self, n)
+ }
+
+ /// Allocates a zeroed vector of `n` names and lets the native
+ /// `GenQueries` fill it in.
+ fn gen_queries(&self, n: GLsizei) -> Vec<GLuint> {
+ let mut names: Vec<GLuint> = vec![0; n as usize];
+ unsafe { GenQueries(n, names.as_mut_ptr()) };
+ names
+ }
+
+ /// Forwards `target` and `id` unchanged to the native `BeginQuery`.
+ fn begin_query(&self, target: GLenum, id: GLuint) {
+ unsafe { BeginQuery(target, id) }
+ }
+
+ /// Forwards `target` unchanged to the native `EndQuery`.
+ fn end_query(&self, target: GLenum) {
+ unsafe { EndQuery(target) }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn query_counter(&self, id: GLuint, target: GLenum) {
+ panic!();
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_query_object_iv(&self, id: GLuint, pname: GLenum) -> i32 {
+ panic!();
+ //0
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_query_object_uiv(&self, id: GLuint, pname: GLenum) -> u32 {
+ panic!();
+ //0
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_query_object_i64v(&self, id: GLuint, pname: GLenum) -> i64 {
+ panic!();
+ //0
+ }
+
+ /// Returns the value the native `GetQueryObjectui64v` writes for `id` and
+ /// `pname`; the output slot starts at 0.
+ fn get_query_object_ui64v(&self, id: GLuint, pname: GLenum) -> u64 {
+ let mut value: GLuint64 = 0;
+ unsafe { GetQueryObjectui64v(id, pname, &mut value) };
+ value
+ }
+
+ /// Calls the native `DeleteQuery` once per entry of `queries`, in order.
+ fn delete_queries(&self, queries: &[GLuint]) {
+ for &query in queries {
+ unsafe { DeleteQuery(query) };
+ }
+ }
+
+ /// Calls the native `DeleteVertexArray` once per entry of
+ /// `vertex_arrays`, in order.
+ fn delete_vertex_arrays(&self, vertex_arrays: &[GLuint]) {
+ for &vao in vertex_arrays {
+ unsafe { DeleteVertexArray(vao) };
+ }
+ }
+
+ /// Alias for `delete_vertex_arrays`; the two behave identically.
+ fn delete_vertex_arrays_apple(&self, vertex_arrays: &[GLuint]) {
+ Gl::delete_vertex_arrays(self, vertex_arrays)
+ }
+
+ /// Calls the native `DeleteBuffer` once per entry of `buffers`, in order.
+ fn delete_buffers(&self, buffers: &[GLuint]) {
+ for &buffer in buffers {
+ unsafe { DeleteBuffer(buffer) };
+ }
+ }
+
+ /// Calls the native `DeleteRenderbuffer` once per entry of
+ /// `renderbuffers`, in order.
+ fn delete_renderbuffers(&self, renderbuffers: &[GLuint]) {
+ for &renderbuffer in renderbuffers {
+ unsafe { DeleteRenderbuffer(renderbuffer) };
+ }
+ }
+
+ /// Calls the native `DeleteFramebuffer` once per entry of
+ /// `framebuffers`, in order.
+ fn delete_framebuffers(&self, framebuffers: &[GLuint]) {
+ for &framebuffer in framebuffers {
+ unsafe { DeleteFramebuffer(framebuffer) };
+ }
+ }
+
+ /// Calls the native `DeleteTexture` once per entry of `textures`, in
+ /// order.
+ fn delete_textures(&self, textures: &[GLuint]) {
+ for &texture in textures {
+ unsafe { DeleteTexture(texture) };
+ }
+ }
+
+ /// Forwards to the native `FramebufferRenderbuffer`, passing every
+ /// argument through unchanged.
+ fn framebuffer_renderbuffer(
+ &self,
+ target: GLenum,
+ attachment: GLenum,
+ renderbuffertarget: GLenum,
+ renderbuffer: GLuint,
+ ) {
+ // Trace label spelled to match the method name ("framebufer" before)
+ // so that searching trace output for the method finds it.
+ debug!(
+ "framebuffer_renderbuffer {} {} {} {}",
+ target, attachment, renderbuffertarget, renderbuffer
+ );
+ unsafe {
+ FramebufferRenderbuffer(target, attachment, renderbuffertarget, renderbuffer);
+ }
+ }
+
+ /// Forwards to the native `RenderbufferStorage`, passing every argument
+ /// through unchanged.
+ fn renderbuffer_storage(
+ &self,
+ target: GLenum,
+ internalformat: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ ) {
+ debug!(
+ "renderbuffer_storage {} {} {} {}",
+ target, internalformat, width, height
+ );
+ unsafe { RenderbufferStorage(target, internalformat, width, height) }
+ }
+
+ /// Forwards `func` unchanged to the native `DepthFunc`.
+ fn depth_func(&self, func: GLenum) {
+ debug!("depth_func {}", func);
+ unsafe { DepthFunc(func) }
+ }
+
+ /// Forwards `texture` unchanged to the native `ActiveTexture`.
+ fn active_texture(&self, texture: GLenum) {
+ unsafe { ActiveTexture(texture) }
+ }
+
+ /// Forwards `program` and `shader` unchanged to the native
+ /// `AttachShader`.
+ fn attach_shader(&self, program: GLuint, shader: GLuint) {
+ debug!("attach shader {} {}", program, shader);
+ unsafe { AttachShader(program, shader) }
+ }
+
+ /// Converts `name` to a C string and passes it with `program` and `index`
+ /// to the native `BindAttribLocation`. Panics if `name` contains an
+ /// interior NUL byte.
+ fn bind_attrib_location(&self, program: GLuint, index: GLuint, name: &str) {
+ debug!("bind_attrib_location {} {} {}", program, index, name);
+ let c_name = CString::new(name).unwrap();
+ let name_ptr = c_name.as_ptr();
+ unsafe { BindAttribLocation(program, index, name_ptr) }
+ }
+
+ // https://www.khronos.org/registry/OpenGL-Refpages/es2.0/xhtml/glGetUniform.xml
+ /// Not supported by this implementation: always panics and leaves
+ /// `result` untouched.
+ unsafe fn get_uniform_iv(&self, program: GLuint, location: GLint, result: &mut [GLint]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ // https://www.khronos.org/registry/OpenGL-Refpages/es2.0/xhtml/glGetUniform.xml
+ /// Not supported by this implementation: always panics and leaves
+ /// `result` untouched.
+ unsafe fn get_uniform_fv(&self, program: GLuint, location: GLint, result: &mut [GLfloat]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_uniform_block_index(&self, program: GLuint, name: &str) -> GLuint {
+ panic!();
+ //0
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_uniform_indices(&self, program: GLuint, names: &[&str]) -> Vec<GLuint> {
+ panic!();
+ //Vec::new()
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn bind_buffer_base(&self, target: GLenum, index: GLuint, buffer: GLuint) {
+ panic!();
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn bind_buffer_range(
+ &self,
+ target: GLenum,
+ index: GLuint,
+ buffer: GLuint,
+ offset: GLintptr,
+ size: GLsizeiptr,
+ ) {
+ panic!();
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn uniform_block_binding(
+ &self,
+ program: GLuint,
+ uniform_block_index: GLuint,
+ uniform_block_binding: GLuint,
+ ) {
+ panic!();
+ }
+
+ /// Forwards `target` and `buffer` unchanged to the native `BindBuffer`.
+ fn bind_buffer(&self, target: GLenum, buffer: GLuint) {
+ unsafe { BindBuffer(target, buffer) }
+ }
+
+ /// Forwards `vao` unchanged to the native `BindVertexArray`.
+ fn bind_vertex_array(&self, vao: GLuint) {
+ unsafe { BindVertexArray(vao) }
+ }
+
+ /// Alias for `bind_vertex_array`; the two behave identically.
+ fn bind_vertex_array_apple(&self, vao: GLuint) {
+ Gl::bind_vertex_array(self, vao)
+ }
+
+ /// Forwards `target` and `renderbuffer` unchanged to the native
+ /// `BindRenderbuffer`.
+ fn bind_renderbuffer(&self, target: GLenum, renderbuffer: GLuint) {
+ debug!("bind_renderbuffer {} {}", target, renderbuffer);
+ unsafe { BindRenderbuffer(target, renderbuffer) }
+ }
+
+ /// Forwards `target` and `framebuffer` unchanged to the native
+ /// `BindFramebuffer`.
+ fn bind_framebuffer(&self, target: GLenum, framebuffer: GLuint) {
+ debug!("bind_framebuffer {} {}", target, framebuffer);
+ unsafe { BindFramebuffer(target, framebuffer) }
+ }
+
+ /// Forwards `target` and `texture` unchanged to the native `BindTexture`.
+ fn bind_texture(&self, target: GLenum, texture: GLuint) {
+ unsafe { BindTexture(target, texture) }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn draw_buffers(&self, bufs: &[GLenum]) {
+ panic!();
+ //unsafe {}
+ }
+
+ // FIXME: Does not verify buffer size -- unsafe!
+ /// Forwards to the native `TexImage2D`. When `opt_data` is `Some`, a
+ /// pointer to the start of the slice is passed as the source; when it is
+ /// `None`, a null pointer is passed instead.
+ fn tex_image_2d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ internal_format: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ border: GLint,
+ format: GLenum,
+ ty: GLenum,
+ opt_data: Option<&[u8]>,
+ ) {
+ let pdata: *const GLvoid =
+ opt_data.map_or(ptr::null(), |data| data.as_ptr() as *const GLvoid);
+ unsafe {
+ TexImage2D(target, level, internal_format, width, height, border, format, ty, pdata)
+ }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn compressed_tex_image_2d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ border: GLint,
+ data: &[u8],
+ ) {
+ panic!();
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn compressed_tex_sub_image_2d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ data: &[u8],
+ ) {
+ panic!();
+ }
+
+ // FIXME: Does not verify buffer size -- unsafe!
+ /// Forwards to the native `TexImage3D`. When `opt_data` is `Some`, a
+ /// pointer to the start of the slice is passed as the source; when it is
+ /// `None`, a null pointer is passed instead.
+ fn tex_image_3d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ internal_format: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ border: GLint,
+ format: GLenum,
+ ty: GLenum,
+ opt_data: Option<&[u8]>,
+ ) {
+ let pdata: *const GLvoid =
+ opt_data.map_or(ptr::null(), |data| data.as_ptr() as *const GLvoid);
+ unsafe {
+ TexImage3D(
+ target, level, internal_format, width, height, depth, border, format, ty, pdata,
+ )
+ }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn copy_tex_image_2d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ internal_format: GLenum,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ border: GLint,
+ ) {
+ panic!();
+ }
+
+ /// Forwards to the native `CopyTexSubImage2D`, passing every argument
+ /// through unchanged.
+ fn copy_tex_sub_image_2d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ ) {
+ unsafe { CopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height) }
+ }
+
+ /// Forwards to the native `CopyTexSubImage3D`, passing every argument
+ /// through unchanged.
+ fn copy_tex_sub_image_3d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ zoffset: GLint,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ ) {
+ unsafe { CopyTexSubImage3D(target, level, xoffset, yoffset, zoffset, x, y, width, height) }
+ }
+
+ /// Forwards to the native `TexSubImage2D`, passing a pointer to the start
+ /// of `data` as the source. The slice length is not checked here.
+ fn tex_sub_image_2d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ data: &[u8],
+ ) {
+ debug!(
+ "tex_sub_image_2d {} {} {} {} {} {} {} {}",
+ target, level, xoffset, yoffset, width, height, format, ty
+ );
+ let src = data.as_ptr() as *const c_void;
+ unsafe { TexSubImage2D(target, level, xoffset, yoffset, width, height, format, ty, src) }
+ }
+
+ /// Forwards to the native `TexSubImage2D`, passing `offset` cast to a
+ /// pointer value in the position of the data pointer.
+ fn tex_sub_image_2d_pbo(
+ &self,
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ offset: usize,
+ ) {
+ debug!(
+ "tex_sub_image_2d_pbo {} {} {} {} {} {} {} {} {}",
+ target, level, xoffset, yoffset, width, height, format, ty, offset
+ );
+ let src = offset as *const c_void;
+ unsafe { TexSubImage2D(target, level, xoffset, yoffset, width, height, format, ty, src) }
+ }
+
+ /// Forwards to the native `TexSubImage3D`, passing a pointer to the start
+ /// of `data` as the source. The slice length is not checked here.
+ fn tex_sub_image_3d(
+ &self,
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ zoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ data: &[u8],
+ ) {
+ debug!("tex_sub_image_3d");
+ let src = data.as_ptr() as *const c_void;
+ unsafe {
+ TexSubImage3D(
+ target, level, xoffset, yoffset, zoffset, width, height, depth, format, ty, src,
+ )
+ }
+ }
+
+ /// Forwards to the native `TexSubImage3D`, passing `offset` cast to a
+ /// pointer value in the position of the data pointer.
+ fn tex_sub_image_3d_pbo(
+ &self,
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ zoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ offset: usize,
+ ) {
+ let src = offset as *const c_void;
+ unsafe {
+ TexSubImage3D(
+ target, level, xoffset, yoffset, zoffset, width, height, depth, format, ty, src,
+ )
+ }
+ }
+
+ /// Forwards to the native `TexStorage2D`, passing every argument through
+ /// unchanged.
+ fn tex_storage_2d(
+ &self,
+ target: GLenum,
+ levels: GLint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ ) {
+ unsafe { TexStorage2D(target, levels, internal_format, width, height) }
+ }
+
+ /// Forwards to the native `TexStorage3D`, passing every argument through
+ /// unchanged.
+ fn tex_storage_3d(
+ &self,
+ target: GLenum,
+ levels: GLint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ ) {
+ unsafe { TexStorage3D(target, levels, internal_format, width, height, depth) }
+ }
+
+ /// Not supported by this implementation: always panics and leaves
+ /// `output` untouched.
+ fn get_tex_image_into_buffer(
+ &self,
+ target: GLenum,
+ level: GLint,
+ format: GLenum,
+ ty: GLenum,
+ output: &mut [u8],
+ ) {
+ panic!();
+ }
+
+ /// Forwards to the native `CopyImageSubData`, passing every argument
+ /// through unchanged and in the same order.
+ unsafe fn copy_image_sub_data(
+ &self,
+ src_name: GLuint,
+ src_target: GLenum,
+ src_level: GLint,
+ src_x: GLint,
+ src_y: GLint,
+ src_z: GLint,
+ dst_name: GLuint,
+ dst_target: GLenum,
+ dst_level: GLint,
+ dst_x: GLint,
+ dst_y: GLint,
+ dst_z: GLint,
+ src_width: GLsizei,
+ src_height: GLsizei,
+ src_depth: GLsizei,
+ ) {
+ CopyImageSubData(
+ src_name,
+ src_target,
+ src_level,
+ src_x,
+ src_y,
+ src_z,
+ dst_name,
+ dst_target,
+ dst_level,
+ dst_x,
+ dst_y,
+ dst_z,
+ src_width,
+ src_height,
+ src_depth,
+ );
+ }
+
+ /// Passes `target`, the number of entries in `attachments` and a pointer
+ /// to its first element to the native `InvalidateFramebuffer`.
+ fn invalidate_framebuffer(&self, target: GLenum, attachments: &[GLenum]) {
+ let count = attachments.len() as GLsizei;
+ unsafe { InvalidateFramebuffer(target, count, attachments.as_ptr()) }
+ }
+
+ /// No-op: the body is empty, so every argument is ignored.
+ fn invalidate_sub_framebuffer(
+ &self,
+ target: GLenum,
+ attachments: &[GLenum],
+ xoffset: GLint,
+ yoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ ) {
+ }
+
+ /// Asks the native `GetIntegerv` to write the value(s) for `name` starting
+ /// at the first element of `result`. Panics if `result` is empty; the
+ /// slice length is otherwise not checked against what the native side
+ /// writes.
+ #[inline]
+ unsafe fn get_integer_v(&self, name: GLenum, result: &mut [GLint]) {
+ assert!(!result.is_empty());
+ GetIntegerv(name, result.as_mut_ptr());
+ }
+
+ /// Not supported by this implementation: always panics and leaves
+ /// `result` untouched.
+ #[inline]
+ unsafe fn get_integer_64v(&self, name: GLenum, result: &mut [GLint64]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ /// Not supported by this implementation: always panics and leaves
+ /// `result` untouched.
+ #[inline]
+ unsafe fn get_integer_iv(&self, name: GLenum, index: GLuint, result: &mut [GLint]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ /// Not supported by this implementation: always panics and leaves
+ /// `result` untouched.
+ #[inline]
+ unsafe fn get_integer_64iv(&self, name: GLenum, index: GLuint, result: &mut [GLint64]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ /// Asks the native `GetBooleanv` to write the value(s) for `name` starting
+ /// at the first element of `result`. Panics if `result` is empty; the
+ /// slice length is otherwise not checked against what the native side
+ /// writes.
+ #[inline]
+ unsafe fn get_boolean_v(&self, name: GLenum, result: &mut [GLboolean]) {
+ debug!("get_boolean_v {}", name);
+ assert!(!result.is_empty());
+ GetBooleanv(name, result.as_mut_ptr());
+ }
+
+ /// Not supported by this implementation: always panics and leaves
+ /// `result` untouched.
+ #[inline]
+ unsafe fn get_float_v(&self, name: GLenum, result: &mut [GLfloat]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_framebuffer_attachment_parameter_iv(
+ &self,
+ target: GLenum,
+ attachment: GLenum,
+ pname: GLenum,
+ ) -> GLint {
+ panic!();
+ //0
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_renderbuffer_parameter_iv(&self, target: GLenum, pname: GLenum) -> GLint {
+ panic!();
+ //0
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_tex_parameter_iv(&self, target: GLenum, pname: GLenum) -> GLint {
+ panic!();
+ //0
+ }
+
+ /// Not supported by this implementation: always panics and never returns
+ /// a value.
+ fn get_tex_parameter_fv(&self, target: GLenum, pname: GLenum) -> GLfloat {
+ panic!();
+ //0.0
+ }
+
+ /// Forwards `target`, `pname` and `param` unchanged to the native
+ /// `TexParameteri`.
+ fn tex_parameter_i(&self, target: GLenum, pname: GLenum, param: GLint) {
+ unsafe { TexParameteri(target, pname, param) }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn tex_parameter_f(&self, target: GLenum, pname: GLenum, param: GLfloat) {
+ panic!();
+ }
+
+ /// Forwards to the native `FramebufferTexture2D`, passing every argument
+ /// through unchanged.
+ fn framebuffer_texture_2d(
+ &self,
+ target: GLenum,
+ attachment: GLenum,
+ textarget: GLenum,
+ texture: GLuint,
+ level: GLint,
+ ) {
+ debug!(
+ "framebuffer_texture_2d {} {} {} {} {}",
+ target, attachment, textarget, texture, level
+ );
+ unsafe { FramebufferTexture2D(target, attachment, textarget, texture, level) }
+ }
+
+ /// Forwards to the native `FramebufferTextureLayer`, passing every
+ /// argument through unchanged.
+ fn framebuffer_texture_layer(
+ &self,
+ target: GLenum,
+ attachment: GLenum,
+ texture: GLuint,
+ level: GLint,
+ layer: GLint,
+ ) {
+ debug!(
+ "framebuffer_texture_layer {} {} {} {} {}",
+ target, attachment, texture, level, layer
+ );
+ unsafe { FramebufferTextureLayer(target, attachment, texture, level, layer) }
+ }
+
+ /// Forwards to the native `BlitFramebuffer`, passing every argument
+ /// through unchanged and in the same order.
+ fn blit_framebuffer(
+ &self,
+ src_x0: GLint,
+ src_y0: GLint,
+ src_x1: GLint,
+ src_y1: GLint,
+ dst_x0: GLint,
+ dst_y0: GLint,
+ dst_x1: GLint,
+ dst_y1: GLint,
+ mask: GLbitfield,
+ filter: GLenum,
+ ) {
+ unsafe {
+ BlitFramebuffer(
+ src_x0,
+ src_y0,
+ src_x1,
+ src_y1,
+ dst_x0,
+ dst_y0,
+ dst_x1,
+ dst_y1,
+ mask,
+ filter,
+ )
+ }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn vertex_attrib_4f(&self, index: GLuint, x: GLfloat, y: GLfloat, z: GLfloat, w: GLfloat) {
+ panic!();
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn vertex_attrib_pointer_f32(
+ &self,
+ index: GLuint,
+ size: GLint,
+ normalized: bool,
+ stride: GLsizei,
+ offset: GLuint,
+ ) {
+ panic!();
+ }
+
+ /// Forwards to the native `VertexAttribPointer`. `normalized` is
+ /// converted to a `GLboolean` and `offset` is cast to a pointer value;
+ /// the other arguments pass through unchanged.
+ fn vertex_attrib_pointer(
+ &self,
+ index: GLuint,
+ size: GLint,
+ type_: GLenum,
+ normalized: bool,
+ stride: GLsizei,
+ offset: GLuint,
+ ) {
+ debug!(
+ "vertex_attrib_pointer {} {} {} {} {} {}",
+ index, size, type_, normalized, stride, offset
+ );
+ let normalized_flag = normalized as GLboolean;
+ let offset_ptr = offset as *const GLvoid;
+ unsafe { VertexAttribPointer(index, size, type_, normalized_flag, stride, offset_ptr) }
+ }
+
+ /// Forwards to the native `VertexAttribIPointer`. `offset` is cast to a
+ /// pointer value; the other arguments pass through unchanged.
+ fn vertex_attrib_i_pointer(
+ &self,
+ index: GLuint,
+ size: GLint,
+ type_: GLenum,
+ stride: GLsizei,
+ offset: GLuint,
+ ) {
+ debug!(
+ "vertex_attrib_i_pointer {} {} {} {} {}",
+ index, size, type_, stride, offset
+ );
+ let offset_ptr = offset as *const GLvoid;
+ unsafe { VertexAttribIPointer(index, size, type_, stride, offset_ptr) }
+ }
+
+ /// Forwards `index` and `divisor` unchanged to the native
+ /// `VertexAttribDivisor`.
+ fn vertex_attrib_divisor(&self, index: GLuint, divisor: GLuint) {
+ debug!("vertex_attrib_divisor {} {}", index, divisor);
+ unsafe { VertexAttribDivisor(index, divisor) }
+ }
+
+ /// Forwards the rectangle unchanged to the native `SetViewport`.
+ fn viewport(&self, x: GLint, y: GLint, width: GLsizei, height: GLsizei) {
+ debug!("viewport {} {} {} {}", x, y, width, height);
+ unsafe { SetViewport(x, y, width, height) }
+ }
+
+ /// Forwards the rectangle unchanged to the native `SetScissor`.
+ fn scissor(&self, x: GLint, y: GLint, width: GLsizei, height: GLsizei) {
+ unsafe { SetScissor(x, y, width, height) }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn line_width(&self, width: GLfloat) {
+ panic!();
+ }
+
+ /// Forwards `program` unchanged to the native `UseProgram`.
+ fn use_program(&self, program: GLuint) {
+ unsafe { UseProgram(program) }
+ }
+
+ /// Not supported by this implementation: always panics.
+ fn validate_program(&self, program: GLuint) {
+ panic!();
+ }
+
+ /// Implemented on top of the native `DrawElementsInstanced`: `NONE` is
+ /// passed as the index type, `first` as the indices offset, and the
+ /// instance count is 1.
+ fn draw_arrays(&self, mode: GLenum, first: GLint, count: GLsizei) {
+ let start = first as GLintptr;
+ unsafe { DrawElementsInstanced(mode, count, NONE, start, 1) }
+ }
+
+ /// Implemented on top of the native `DrawElementsInstanced`: `NONE` is
+ /// passed as the index type, `first` as the indices offset, and
+ /// `primcount` as the instance count.
+ fn draw_arrays_instanced(
+ &self,
+ mode: GLenum,
+ first: GLint,
+ count: GLsizei,
+ primcount: GLsizei,
+ ) {
+ let start = first as GLintptr;
+ unsafe { DrawElementsInstanced(mode, count, NONE, start, primcount) }
+ }
+
+ /// Implemented on top of the native `DrawElementsInstanced` with an
+ /// instance count of 1. `indices_offset` is widened to `GLintptr`; the
+ /// other arguments pass through unchanged.
+ fn draw_elements(
+ &self,
+ mode: GLenum,
+ count: GLsizei,
+ element_type: GLenum,
+ indices_offset: GLuint,
+ ) {
+ // One placeholder per argument. The previous string had five
+ // placeholders for four arguments, which only compiled because
+ // `debug!` currently expands to nothing.
+ debug!(
+ "draw_elements {} {} {} {}",
+ mode, count, element_type, indices_offset
+ );
+ unsafe {
+ DrawElementsInstanced(
+ mode,
+ count,
+ element_type,
+ indices_offset as GLintptr,
+ 1,
+ );
+ }
+ }
+
+ /// Forwards to the native `DrawElementsInstanced`. `indices_offset` is
+ /// widened to `GLintptr`; the other arguments pass through unchanged.
+ fn draw_elements_instanced(
+ &self,
+ mode: GLenum,
+ count: GLsizei,
+ element_type: GLenum,
+ indices_offset: GLuint,
+ primcount: GLsizei,
+ ) {
+ debug!(
+ "draw_elements_instanced {} {} {} {} {}",
+ mode, count, element_type, indices_offset, primcount
+ );
+ let start = indices_offset as GLintptr;
+ unsafe { DrawElementsInstanced(mode, count, element_type, start, primcount) }
+ }
+
+ /// Forwards the four colour components unchanged to `BlendColor`.
+ fn blend_color(&self, r: f32, g: f32, b: f32, a: f32) {
+ unsafe {
+ BlendColor(r, g, b, a);
+ }
+ }
+
+ /// Sets one source/destination factor pair for both the RGB and the alpha
+ /// slots of `BlendFunc`.
+ fn blend_func(&self, sfactor: GLenum, dfactor: GLenum) {
+     // `BlendFunc` takes separate RGB and alpha factors; reuse the same pair.
+     let (src_rgb, dst_rgb) = (sfactor, dfactor);
+     let (src_alpha, dst_alpha) = (sfactor, dfactor);
+     unsafe { BlendFunc(src_rgb, dst_rgb, src_alpha, dst_alpha) };
+ }
+
+ /// Forwards the separate RGB and alpha blend factors unchanged to
+ /// `BlendFunc`.
+ fn blend_func_separate(
+ &self,
+ src_rgb: GLenum,
+ dest_rgb: GLenum,
+ src_alpha: GLenum,
+ dest_alpha: GLenum,
+ ) {
+ unsafe {
+ BlendFunc(src_rgb, dest_rgb, src_alpha, dest_alpha);
+ }
+ }
+
+ /// Forwards `mode` unchanged to `BlendEquation`.
+ fn blend_equation(&self, mode: GLenum) {
+ unsafe {
+ BlendEquation(mode);
+ }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn blend_equation_separate(&self, mode_rgb: GLenum, mode_alpha: GLenum) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn color_mask(&self, r: bool, g: bool, b: bool, a: bool) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn cull_face(&self, mode: GLenum) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn front_face(&self, mode: GLenum) {
+ panic!();
+ }
+
+ /// Emits a `debug!` trace and forwards `cap` unchanged to `Enable`.
+ fn enable(&self, cap: GLenum) {
+ debug!("enable {}", cap);
+ //panic!();
+ unsafe {
+ Enable(cap);
+ }
+ }
+
+ /// Emits a `debug!` trace and forwards `cap` unchanged to `Disable`.
+ fn disable(&self, cap: GLenum) {
+ debug!("disable {}", cap);
+ //panic!();
+ unsafe {
+ Disable(cap);
+ }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn hint(&self, param_name: GLenum, param_val: GLenum) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn is_enabled(&self, cap: GLenum) -> GLboolean {
+ panic!();
+ //0
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn is_shader(&self, shader: GLuint) -> GLboolean {
+ panic!();
+ //0
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn is_texture(&self, texture: GLenum) -> GLboolean {
+ panic!();
+ //0
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn is_framebuffer(&self, framebuffer: GLenum) -> GLboolean {
+ panic!();
+ //0
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn is_renderbuffer(&self, renderbuffer: GLenum) -> GLboolean {
+ panic!();
+ //0
+ }
+
+ /// Emits a `debug!` trace and returns whatever `CheckFramebufferStatus`
+ /// reports for `target`.
+ fn check_frame_buffer_status(&self, target: GLenum) -> GLenum {
+ debug!("check_frame_buffer_status {}", target);
+ //panic!();
+ unsafe { CheckFramebufferStatus(target) }
+ }
+
+ /// Emits a `debug!` trace and forwards `index` unchanged to
+ /// `EnableVertexAttribArray`.
+ fn enable_vertex_attrib_array(&self, index: GLuint) {
+ //panic!();
+ debug!("enable_vertex_attrib_array {}", index);
+ unsafe {
+ EnableVertexAttribArray(index);
+ //assert_eq!(index, 0);
+ }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn disable_vertex_attrib_array(&self, index: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_1f(&self, location: GLint, v0: GLfloat) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_1fv(&self, location: GLint, values: &[f32]) {
+ panic!();
+ }
+
+ /// Emits a `debug!` trace and forwards `location` and `v0` unchanged to
+ /// `Uniform1i`.
+ fn uniform_1i(&self, location: GLint, v0: GLint) {
+ debug!("uniform_1i {} {}", location, v0);
+ //panic!();
+ unsafe {
+ Uniform1i(location, v0);
+ }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_1iv(&self, location: GLint, values: &[i32]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_1ui(&self, location: GLint, v0: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_2f(&self, location: GLint, v0: GLfloat, v1: GLfloat) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_2fv(&self, location: GLint, values: &[f32]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_2i(&self, location: GLint, v0: GLint, v1: GLint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_2iv(&self, location: GLint, values: &[i32]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_2ui(&self, location: GLint, v0: GLuint, v1: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_3f(&self, location: GLint, v0: GLfloat, v1: GLfloat, v2: GLfloat) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_3fv(&self, location: GLint, values: &[f32]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_3i(&self, location: GLint, v0: GLint, v1: GLint, v2: GLint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_3iv(&self, location: GLint, values: &[i32]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_3ui(&self, location: GLint, v0: GLuint, v1: GLuint, v2: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_4f(&self, location: GLint, x: GLfloat, y: GLfloat, z: GLfloat, w: GLfloat) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_4i(&self, location: GLint, x: GLint, y: GLint, z: GLint, w: GLint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_4iv(&self, location: GLint, values: &[i32]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_4ui(&self, location: GLint, x: GLuint, y: GLuint, z: GLuint, w: GLuint) {
+ panic!();
+ }
+
+ /// Uploads `values` through `Uniform4fv`, treating every four floats as one
+ /// element; trailing floats that do not fill a group of four are not
+ /// counted.
+ fn uniform_4fv(&self, location: GLint, values: &[f32]) {
+     let vec4_count = (values.len() / 4) as GLsizei;
+     let data = values.as_ptr();
+     unsafe {
+         Uniform4fv(location, vec4_count, data);
+     }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_matrix_2fv(&self, location: GLint, transpose: bool, value: &[f32]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn uniform_matrix_3fv(&self, location: GLint, transpose: bool, value: &[f32]) {
+ panic!();
+ }
+
+ /// Uploads `value` through `UniformMatrix4fv`, treating every sixteen
+ /// floats as one matrix; trailing floats that do not fill a matrix are not
+ /// counted.
+ fn uniform_matrix_4fv(&self, location: GLint, transpose: bool, value: &[f32]) {
+     debug!("uniform_matrix_4fv {} {} {:?}", location, transpose, value);
+     let matrix_count = (value.len() / 16) as GLsizei;
+     let transpose_flag = transpose as GLboolean;
+     let data = value.as_ptr();
+     unsafe {
+         UniformMatrix4fv(location, matrix_count, transpose_flag, data);
+     }
+ }
+
+ /// Emits a `debug!` trace and forwards `flag`, cast to `GLboolean`, to
+ /// `DepthMask`.
+ fn depth_mask(&self, flag: bool) {
+ debug!("depth_mask {}", flag);
+ //panic!();
+ unsafe {
+ DepthMask(flag as GLboolean);
+ }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn depth_range(&self, near: f64, far: f64) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_active_attrib(&self, program: GLuint, index: GLuint) -> (i32, u32, String) {
+ panic!();
+ //(0, 0, String::new())
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_active_uniform(&self, program: GLuint, index: GLuint) -> (i32, u32, String) {
+ panic!();
+ //(0, 0, String::new())
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_active_uniforms_iv(
+ &self,
+ program: GLuint,
+ indices: Vec<GLuint>,
+ pname: GLenum,
+ ) -> Vec<GLint> {
+ panic!();
+ //Vec::new()
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_active_uniform_block_i(&self, program: GLuint, index: GLuint, pname: GLenum) -> GLint {
+ panic!();
+ //0
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_active_uniform_block_iv(
+ &self,
+ program: GLuint,
+ index: GLuint,
+ pname: GLenum,
+ ) -> Vec<GLint> {
+ panic!();
+ //Vec::new()
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_active_uniform_block_name(&self, program: GLuint, index: GLuint) -> String {
+ panic!();
+ //String::new()
+ }
+
+ /// Returns the result of `GetAttribLocation` for `name` in `program`.
+ ///
+ /// # Panics
+ /// Panics if `name` contains an interior NUL byte, because it cannot be
+ /// converted to a `CString`.
+ fn get_attrib_location(&self, program: GLuint, name: &str) -> c_int {
+     // Keep the CString alive in a local for the whole duration of the call.
+     let c_name = CString::new(name).unwrap();
+     let name_ptr = c_name.as_ptr();
+     unsafe { GetAttribLocation(program, name_ptr) }
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_frag_data_location(&self, program: GLuint, name: &str) -> c_int {
+ panic!();
+ //0
+ }
+
+ /// Returns the result of `GetUniformLocation` for `name` in `program`,
+ /// after emitting a `debug!` trace.
+ ///
+ /// # Panics
+ /// Panics if `name` contains an interior NUL byte, because it cannot be
+ /// converted to a `CString`.
+ fn get_uniform_location(&self, program: GLuint, name: &str) -> c_int {
+     debug!("get_uniform_location {} {}", program, name);
+     // Keep the CString alive in a local for the whole duration of the call.
+     let c_name = CString::new(name).unwrap();
+     let name_ptr = c_name.as_ptr();
+     unsafe { GetUniformLocation(program, name_ptr) }
+ }
+
+ /// Emits a `debug!` trace and always returns an empty string; no log is
+ /// retrieved for `program`.
+ fn get_program_info_log(&self, program: GLuint) -> String {
+ debug!("get_program_info_log {}", program);
+ String::new()
+ }
+
+ /// Answers a program parameter query.
+ ///
+ /// Only `GL_LINK_STATUS` (0x8B82) is handled: `result[0]` receives the
+ /// value of `GetLinkStatus(program)`. For any other `pname`, `result` is
+ /// left untouched.
+ ///
+ /// # Panics
+ /// Panics if `result` is empty.
+ #[inline]
+ unsafe fn get_program_iv(&self, program: GLuint, pname: GLenum, result: &mut [GLint]) {
+     // #define GL_LINK_STATUS 0x8B82
+     const LINK_STATUS_PNAME: GLenum = 0x8B82;
+     debug!("get_program_iv {}", pname);
+     assert!(!result.is_empty());
+     if pname != LINK_STATUS_PNAME {
+         return;
+     }
+     result[0] = GetLinkStatus(program);
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_program_binary(&self, program: GLuint) -> (Vec<u8>, GLenum) {
+ panic!();
+ //(Vec::new(), NONE)
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn program_binary(&self, program: GLuint, format: GLenum, binary: &[u8]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn program_parameter_i(&self, program: GLuint, pname: GLenum, value: GLint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally; `result` is never written.
+ #[inline]
+ unsafe fn get_vertex_attrib_iv(&self, index: GLuint, pname: GLenum, result: &mut [GLint]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ /// Not implemented: panics unconditionally; `result` is never written.
+ #[inline]
+ unsafe fn get_vertex_attrib_fv(&self, index: GLuint, pname: GLenum, result: &mut [GLfloat]) {
+ panic!();
+ //assert!(!result.is_empty());
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_vertex_attrib_pointer_v(&self, index: GLuint, pname: GLenum) -> GLsizeiptr {
+ panic!();
+ //0
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_buffer_parameter_iv(&self, target: GLuint, pname: GLenum) -> GLint {
+ panic!();
+ //0
+ }
+
+ /// Emits a `debug!` trace and always returns an empty string; no log is
+ /// retrieved for `shader`.
+ fn get_shader_info_log(&self, shader: GLuint) -> String {
+ debug!("get_shader_info_log {}", shader);
+ //panic!();
+ String::new()
+ }
+
+ /// Returns the string reported by `GetString` for `which` as an owned
+ /// `String`, or an empty string when a null pointer is returned.
+ ///
+ /// Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
+ /// being trusted blindly.
+ fn get_string(&self, which: GLenum) -> String {
+     unsafe {
+         let llstr = GetString(which);
+         if llstr.is_null() {
+             String::new()
+         } else {
+             // `to_string_lossy` validates the bytes; the previous
+             // `from_utf8_unchecked` was undefined behaviour for any
+             // non-UTF-8 input coming across the FFI boundary.
+             CStr::from_ptr(llstr).to_string_lossy().into_owned()
+         }
+     }
+ }
+
+ /// Returns the string reported by `GetStringi` for `which` at `index` as
+ /// an owned `String`, or an empty string when a null pointer is returned.
+ ///
+ /// Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
+ /// being trusted blindly.
+ fn get_string_i(&self, which: GLenum, index: GLuint) -> String {
+     unsafe {
+         let llstr = GetStringi(which, index);
+         if llstr.is_null() {
+             String::new()
+         } else {
+             // `to_string_lossy` validates the bytes; the previous
+             // `from_utf8_unchecked` was undefined behaviour for any
+             // non-UTF-8 input coming across the FFI boundary.
+             CStr::from_ptr(llstr).to_string_lossy().into_owned()
+         }
+     }
+ }
+
+ /// Answers a shader parameter query.
+ ///
+ /// Only `COMPILE_STATUS` (0x8B81) is handled, and it always reports 1 in
+ /// `result[0]`. For any other `pname`, `result` is left untouched.
+ ///
+ /// # Panics
+ /// Panics if `result` is empty.
+ unsafe fn get_shader_iv(&self, shader: GLuint, pname: GLenum, result: &mut [GLint]) {
+     // gl::COMPILE_STATUS
+     const COMPILE_STATUS_PNAME: GLenum = 0x8B81;
+     debug!("get_shader_iv");
+     assert!(!result.is_empty());
+     match pname {
+         COMPILE_STATUS_PNAME => result[0] = 1,
+         _ => {}
+     }
+ }
+
+ /// Returns a hard-coded three-value tuple for `precision_type` without
+ /// querying anything; `_shader_type` is ignored. Unknown precision types
+ /// yield `(0, 0, 0)`.
+ fn get_shader_precision_format(
+ &self,
+ _shader_type: GLuint,
+ precision_type: GLuint,
+ ) -> (GLint, GLint, GLint) {
+ // gl.GetShaderPrecisionFormat is not available until OpenGL 4.1.
+ // Fall back to the OpenGL standard precisions that most desktop hardware supports.
+ match precision_type {
+ LOW_FLOAT | MEDIUM_FLOAT | HIGH_FLOAT => {
+ // Fallback to IEEE 754 single precision
+ // Range: from -2^127 to 2^127
+ // Significand precision: 23 bits
+ (127, 127, 23)
+ }
+ LOW_INT | MEDIUM_INT | HIGH_INT => {
+ // Fallback to single precision integer
+ // Range: from -2^24 to 2^24
+ // Precision: For integer formats this value is always 0
+ (24, 24, 0)
+ }
+ _ => (0, 0, 0),
+ }
+ }
+
+ /// Only emits a `debug!` trace; no compilation work is done here.
+ fn compile_shader(&self, shader: GLuint) {
+ debug!("compile_shader {}", shader);
+ //panic!();
+ }
+
+ /// Emits a `debug!` trace and returns the id produced by `CreateProgram`.
+ fn create_program(&self) -> GLuint {
+ debug!("create_program");
+ //panic!();
+ unsafe { CreateProgram() }
+ }
+
+ /// Forwards `program` unchanged to `DeleteProgram`.
+ fn delete_program(&self, program: GLuint) {
+ unsafe {
+ DeleteProgram(program);
+ }
+ }
+
+ /// Emits a `debug!` trace and returns the id produced by
+ /// `CreateShader(shader_type)`.
+ fn create_shader(&self, shader_type: GLenum) -> GLuint {
+ debug!("create_shader {}", shader_type);
+ //panic!();
+ unsafe { CreateShader(shader_type) }
+ }
+
+ /// Emits a `debug!` trace and forwards `shader` unchanged to
+ /// `DeleteShader`.
+ fn delete_shader(&self, shader: GLuint) {
+ debug!("delete_shader {}", shader);
+ //panic!();
+ unsafe {
+ DeleteShader(shader);
+ }
+ }
+
+ /// Only emits a `debug!` trace; nothing is detached here.
+ fn detach_shader(&self, program: GLuint, shader: GLuint) {
+ debug!("detach_shader {} {}", program, shader);
+ //panic!();
+ }
+
+ /// Emits a `debug!` trace and forwards `program` unchanged to
+ /// `LinkProgram`.
+ fn link_program(&self, program: GLuint) {
+ debug!("link_program {}", program);
+ //panic!();
+ unsafe {
+ LinkProgram(program);
+ }
+ }
+
+ /// Forwards the four colour components unchanged to `ClearColor`.
+ fn clear_color(&self, r: f32, g: f32, b: f32, a: f32) {
+ //panic!();
+ unsafe {
+ ClearColor(r, g, b, a);
+ }
+ }
+
+ /// Emits a `debug!` trace and forwards `buffer_mask` unchanged to `Clear`.
+ fn clear(&self, buffer_mask: GLbitfield) {
+ debug!("clear {}", buffer_mask);
+ //panic!();
+ unsafe {
+ Clear(buffer_mask);
+ }
+ }
+
+ /// Emits a `debug!` trace and forwards `depth`, cast to `GLclampd`, to
+ /// `ClearDepth`.
+ fn clear_depth(&self, depth: f64) {
+ debug!("clear_depth {}", depth);
+ //panic!();
+ unsafe {
+ ClearDepth(depth as GLclampd);
+ }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn clear_stencil(&self, s: GLint) {
+ panic!();
+ }
+
+ /// No-op: the body is empty.
+ fn flush(&self) {}
+
+ /// Calls `Finish`.
+ fn finish(&self) {
+ unsafe {
+ Finish();
+ }
+ }
+
+ /// Returns whatever `GetError` reports.
+ fn get_error(&self) -> GLenum {
+ //panic!();
+ unsafe { GetError() }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn stencil_mask(&self, mask: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn stencil_mask_separate(&self, face: GLenum, mask: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn stencil_func(&self, func: GLenum, ref_: GLint, mask: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn stencil_func_separate(&self, face: GLenum, func: GLenum, ref_: GLint, mask: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn stencil_op(&self, sfail: GLenum, dpfail: GLenum, dppass: GLenum) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn stencil_op_separate(&self, face: GLenum, sfail: GLenum, dpfail: GLenum, dppass: GLenum) {
+ panic!();
+ }
+
+ /// Always panics with the message "not supported".
+ fn egl_image_target_texture2d_oes(&self, target: GLenum, image: GLeglImageOES) {
+ panic!("not supported")
+ }
+
+ /// Always panics with the message "not supported".
+ fn egl_image_target_renderbuffer_storage_oes(&self, target: GLenum, image: GLeglImageOES) {
+ panic!("not supported")
+ }
+
+ /// Forwards `target` unchanged to `GenerateMipmap`.
+ fn generate_mipmap(&self, target: GLenum) {
+ unsafe {
+ GenerateMipmap(target);
+ }
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn insert_event_marker_ext(&self, message: &str) {
+ panic!();
+ }
+
+ /// Not implemented: emits a `debug!` trace of `message`, then panics
+ /// unconditionally.
+ fn push_group_marker_ext(&self, message: &str) {
+ debug!("push group {}", message);
+ panic!();
+ }
+
+ /// Not implemented: emits a `debug!` trace, then panics unconditionally.
+ fn pop_group_marker_ext(&self) {
+ debug!("pop group");
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn debug_message_insert_khr(
+ &self,
+ source: GLenum,
+ type_: GLenum,
+ id: GLuint,
+ severity: GLenum,
+ message: &str,
+ ) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn push_debug_group_khr(&self, source: GLenum, id: GLuint, message: &str) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn pop_debug_group_khr(&self) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn fence_sync(&self, condition: GLenum, flags: GLbitfield) -> GLsync {
+ panic!();
+ //ptr::null()
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) -> GLenum {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn texture_range_apple(&self, target: GLenum, data: &[u8]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn delete_sync(&self, sync: GLsync) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn gen_fences_apple(&self, n: GLsizei) -> Vec<GLuint> {
+ panic!();
+ //Vec::new()
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn delete_fences_apple(&self, fences: &[GLuint]) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn set_fence_apple(&self, fence: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn finish_fence_apple(&self, fence: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn test_fence_apple(&self, fence: GLuint) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn test_object_apple(&self, object: GLenum, name: GLuint) -> GLboolean {
+ panic!();
+ //0
+ }
+
+ /// Not implemented: panics unconditionally when called.
+ fn finish_object_apple(&self, object: GLenum, name: GLuint) {
+ panic!();
+ }
+
+ // GL_ARB_blend_func_extended
+ /// Not implemented: panics unconditionally when called.
+ fn bind_frag_data_location_indexed(
+ &self,
+ program: GLuint,
+ color_number: GLuint,
+ index: GLuint,
+ name: &str,
+ ) {
+ panic!();
+ }
+
+ /// Not implemented: panics unconditionally, so no value is ever returned.
+ fn get_frag_data_index(&self, program: GLuint, name: &str) -> GLint {
+ panic!();
+ //-1
+ }
+
+ // GL_KHR_debug
+ /// Always returns an empty list; no debug messages are collected here.
+ fn get_debug_messages(&self) -> Vec<DebugMessage> {
+ Vec::new()
+ }
+
+ /// Always panics via `unimplemented!`; `_mode` is ignored.
+ fn provoking_vertex_angle(&self, _mode: GLenum) {
+ unimplemented!("This extension is GLES only");
+ }
+
+ // GL_KHR_blend_equation_advanced
+ /// Not implemented: panics unconditionally when called.
+ fn blend_barrier_khr(&self) {
+ panic!();
+ }
+
+ // GL_CHROMIUM_copy_texture
+ /// Always panics via `unimplemented!`; every argument is ignored.
+ fn copy_texture_chromium(
+ &self,
+ _source_id: GLuint,
+ _source_level: GLint,
+ _dest_target: GLenum,
+ _dest_id: GLuint,
+ _dest_level: GLint,
+ _internal_format: GLint,
+ _dest_type: GLenum,
+ _unpack_flip_y: GLboolean,
+ _unpack_premultiply_alpha: GLboolean,
+ _unpack_unmultiply_alpha: GLboolean,
+ ) {
+ unimplemented!("This extension is GLES only");
+ }
+ /// Always panics via `unimplemented!`; every argument is ignored.
+ fn copy_sub_texture_chromium(
+ &self,
+ _source_id: GLuint,
+ _source_level: GLint,
+ _dest_target: GLenum,
+ _dest_id: GLuint,
+ _dest_level: GLint,
+ _x_offset: GLint,
+ _y_offset: GLint,
+ _x: GLint,
+ _y: GLint,
+ _width: GLsizei,
+ _height: GLsizei,
+ _unpack_flip_y: GLboolean,
+ _unpack_premultiply_alpha: GLboolean,
+ _unpack_unmultiply_alpha: GLboolean,
+ ) {
+ unimplemented!("This extension is GLES only");
+ }
+
+ // GL_ANGLE_copy_texture_3d
+ /// Always panics via `unimplemented!`; every argument is ignored.
+ fn copy_texture_3d_angle(
+ &self,
+ _source_id: GLuint,
+ _source_level: GLint,
+ _dest_target: GLenum,
+ _dest_id: GLuint,
+ _dest_level: GLint,
+ _internal_format: GLint,
+ _dest_type: GLenum,
+ _unpack_flip_y: GLboolean,
+ _unpack_premultiply_alpha: GLboolean,
+ _unpack_unmultiply_alpha: GLboolean,
+ ) {
+ unimplemented!("Not supported by SWGL");
+ }
+
+ /// Always panics via `unimplemented!`; every argument is ignored.
+ fn copy_sub_texture_3d_angle(
+ &self,
+ _source_id: GLuint,
+ _source_level: GLint,
+ _dest_target: GLenum,
+ _dest_id: GLuint,
+ _dest_level: GLint,
+ _x_offset: GLint,
+ _y_offset: GLint,
+ _z_offset: GLint,
+ _x: GLint,
+ _y: GLint,
+ _z: GLint,
+ _width: GLsizei,
+ _height: GLsizei,
+ _depth: GLsizei,
+ _unpack_flip_y: GLboolean,
+ _unpack_premultiply_alpha: GLboolean,
+ _unpack_unmultiply_alpha: GLboolean,
+ ) {
+ unimplemented!("Not supported by SWGL");
+ }
+
+ /// Always panics via `unimplemented!`; no argument is used.
+ fn buffer_storage(
+ &self,
+ target: GLenum,
+ size: GLsizeiptr,
+ data: *const GLvoid,
+ flags: GLbitfield,
+ ) {
+ unimplemented!("Not supported by SWGL");
+ }
+
+ /// Always panics via `unimplemented!`; no argument is used.
+ fn flush_mapped_buffer_range(&self, target: GLenum, offset: GLintptr, length: GLsizeiptr) {
+ unimplemented!("Not supported by SWGL");
+ }
+}
+
+/// A resource that is intended for sharing between threads.
+/// Locked resources such as textures or framebuffers do not
+/// allow any further modification while they remain locked.
+/// The resource is unlocked when the `LockedResource` is
+/// dropped.
+pub struct LockedResource(*mut LockedTexture);
+
+// NOTE(review): these impls assert that the wrapped `*mut LockedTexture` may
+// be moved to and shared between threads. Nothing visible here shows what
+// upholds that on the other side of the FFI boundary; confirm against the
+// implementation of the lock/unlock functions.
+unsafe impl Send for LockedResource {}
+unsafe impl Sync for LockedResource {}
+
+/// Colour space selector handed by value to `CompositeYUV`.
+///
+/// `#[repr(C)]` with discriminants counting up from 0 in declaration order.
+#[repr(C)]
+pub enum YUVColorSpace {
+ Rec601 = 0,
+ Rec709,
+ Rec2020,
+ Identity,
+}
+
+impl LockedResource {
+ /// Composites from a locked resource to another locked resource. The band
+ /// offset and height are relative to the destination rectangle and specify
+ /// how to clip the composition into appropriate range for this band.
+ ///
+ /// `self` is passed as the first pointer and `locked_src` as the second
+ /// pointer to `Composite`; `opaque` and `flip` are cast to `GLboolean`,
+ /// and every other argument is forwarded unchanged.
+ pub fn composite(
+ &self,
+ locked_src: &LockedResource,
+ src_x: GLint,
+ src_y: GLint,
+ src_width: GLsizei,
+ src_height: GLsizei,
+ dst_x: GLint,
+ dst_y: GLint,
+ dst_width: GLsizei,
+ dst_height: GLsizei,
+ opaque: bool,
+ flip: bool,
+ filter: GLenum,
+ clip_x: GLint,
+ clip_y: GLint,
+ clip_width: GLsizei,
+ clip_height: GLsizei,
+ ) {
+ unsafe {
+ Composite(
+ self.0,
+ locked_src.0,
+ src_x,
+ src_y,
+ src_width,
+ src_height,
+ dst_x,
+ dst_y,
+ dst_width,
+ dst_height,
+ opaque as GLboolean,
+ flip as GLboolean,
+ filter,
+ clip_x,
+ clip_y,
+ clip_width,
+ clip_height,
+ );
+ }
+ }
+
+ /// Composites from locked resources representing YUV planes
+ ///
+ /// `self` is passed as the first pointer to `CompositeYUV`, followed by
+ /// the three plane resources; `flip` is cast to `GLboolean` and every
+ /// other argument is forwarded unchanged.
+ pub fn composite_yuv(
+ &self,
+ locked_y: &LockedResource,
+ locked_u: &LockedResource,
+ locked_v: &LockedResource,
+ color_space: YUVColorSpace,
+ color_depth: GLuint,
+ src_x: GLint,
+ src_y: GLint,
+ src_width: GLsizei,
+ src_height: GLsizei,
+ dst_x: GLint,
+ dst_y: GLint,
+ dst_width: GLsizei,
+ dst_height: GLsizei,
+ flip: bool,
+ clip_x: GLint,
+ clip_y: GLint,
+ clip_width: GLsizei,
+ clip_height: GLsizei,
+ ) {
+ unsafe {
+ CompositeYUV(
+ self.0,
+ locked_y.0,
+ locked_u.0,
+ locked_v.0,
+ color_space,
+ color_depth,
+ src_x,
+ src_y,
+ src_width,
+ src_height,
+ dst_x,
+ dst_y,
+ dst_width,
+ dst_height,
+ flip as GLboolean,
+ clip_x,
+ clip_y,
+ clip_width,
+ clip_height,
+ );
+ }
+ }
+
+ /// Get the underlying buffer for a locked resource
+ ///
+ /// Returns `(data_ptr, width, height, stride)`, where the pointer is the
+ /// return value of `GetResourceBuffer` and the three integers are the
+ /// out-parameters it filled in (each starts at 0 before the call).
+ pub fn get_buffer(&self) -> (*mut c_void, i32, i32, i32) {
+ unsafe {
+ let mut width: i32 = 0;
+ let mut height: i32 = 0;
+ let mut stride: i32 = 0;
+ let data_ptr = GetResourceBuffer(self.0, &mut width, &mut height, &mut stride);
+ (data_ptr, width, height, stride)
+ }
+ }
+}
+
+/// Cloning calls `LockResource` on the wrapped pointer and then hands out a
+/// second handle to the same pointer.
+impl Clone for LockedResource {
+    fn clone(&self) -> Self {
+        let raw = self.0;
+        unsafe {
+            LockResource(raw);
+        }
+        LockedResource(raw)
+    }
+}
+
+/// Dropping a handle calls `UnlockResource` on the wrapped pointer.
+impl Drop for LockedResource {
+ fn drop(&mut self) {
+ unsafe { UnlockResource(self.0); }
+ }
+}
+
diff --git a/gfx/wr/swgl/src/texture.h b/gfx/wr/swgl/src/texture.h
new file mode 100644
index 0000000000..9d4138b8c5
--- /dev/null
+++ b/gfx/wr/swgl/src/texture.h
@@ -0,0 +1,1262 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+namespace glsl {
+
+using PackedRGBA8 = V16<uint8_t>;
+using WideRGBA8 = V16<uint16_t>;
+using HalfRGBA8 = V8<uint16_t>;
+
+// Widens 16 packed 8-bit lanes (PackedRGBA8) to 16-bit lanes (WideRGBA8).
+SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); }
+
+// Narrows N 16-bit lanes to N 8-bit lanes without platform intrinsics. Used
+// as the fallback branch of the pack() overloads in this file.
+template <int N>
+UNUSED SI VectorType<uint8_t, N> genericPackWide(VectorType<uint16_t, N> p) {
+ typedef VectorType<uint8_t, N> packed_type;
+ // Generic conversions only mask off the low byte without actually clamping
+ // like a real pack. First force the word to all 1s if it overflows, and then
+ // add on the sign bit to cause it to roll over to 0 if it was negative.
+ p = (p | (p > 255)) + (p >> 15);
+ return CONVERT(p, packed_type);
+}
+
+// Narrows 16 16-bit lanes to 16 8-bit lanes. Uses the SSE2 or NEON narrowing
+// intrinsic on the low and high halves when available, otherwise the generic
+// fallback.
+SI PackedRGBA8 pack(WideRGBA8 p) {
+#if USE_SSE2
+ return _mm_packus_epi16(lowHalf(p), highHalf(p));
+#elif USE_NEON
+ return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p)));
+#else
+ return genericPackWide(p);
+#endif
+}
+
+using PackedR8 = V4<uint8_t>;
+using WideR8 = V4<uint16_t>;
+
+// Widens 4 packed 8-bit lanes (PackedR8) to 16-bit lanes (WideR8).
+SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); }
+
+// Narrows 4 16-bit lanes to 4 8-bit lanes. The intrinsic paths first run the
+// 4-lane input through expand(), pack it, and then keep only the first four
+// resulting bytes.
+SI PackedR8 pack(WideR8 p) {
+#if USE_SSE2
+ auto m = expand(p);
+ auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m));
+ return SHUFFLE(r, r, 0, 1, 2, 3);
+#elif USE_NEON
+ return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p))));
+#else
+ return genericPackWide(p);
+#endif
+}
+
+using PackedRG8 = V8<uint8_t>;
+using WideRG8 = V8<uint16_t>;
+
+// Narrows 8 16-bit lanes to 8 8-bit lanes. On SSE2 the input is packed with
+// itself and only the low half of the 16-byte result is kept.
+SI PackedRG8 pack(WideRG8 p) {
+#if USE_SSE2
+ return lowHalf(bit_cast<V16<uint8_t>>(_mm_packus_epi16(p, p)));
+#elif USE_NEON
+ return bit_cast<V8<uint8_t>>(vqmovn_u16(p));
+#else
+ return genericPackWide(p);
+#endif
+}
+
+// Clamps each lane of coord to the inclusive range [base, limit - 1].
+// NOTE(review): the SSE2 path applies the 16-bit min/max intrinsics to a
+// vector of 32-bit lanes. That presumably relies on coord, base and limit all
+// fitting in a signed 16-bit value (so each 16-bit half compares
+// consistently); confirm that texture dimensions are bounded accordingly.
+SI I32 clampCoord(I32 coord, int limit, int base = 0) {
+#if USE_SSE2
+ return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)),
+ _mm_set1_epi32(limit - 1));
+#else
+ return clamp(coord, base, limit - 1);
+#endif
+}
+
+// Scalar clamp of coord to the inclusive range [base, limit - 1]. The lower
+// bound is applied first, then the upper bound.
+SI int clampCoord(int coord, int limit, int base = 0) {
+  const int raised = max(coord, base);
+  return min(raised, limit - 1);
+}
+
+// Clamps P.x to [0, sampler->width - 1] and P.y to [0, sampler->height - 1].
+template <typename T, typename S>
+SI T clamp2D(T P, S sampler) {
+ return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)};
+}
+
+// Clamps P.x, P.y and P.z to the sampler's width, height and depth
+// respectively (each to [0, dimension - 1]).
+template <typename T>
+SI T clamp2DArray(T P, sampler2DArray sampler) {
+ return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height),
+ clampCoord(P.z, sampler->depth)};
+}
+
+// Scales an integer channel value by 1/255.
+SI float to_float(uint32_t x) {
+  const float inv_255 = 1.f / 255.f;
+  return x * inv_255;
+}
+
+// Unpacks four 32-bit pixels (one per lane) into a vec4 whose components
+// are scaled by 1/255. The four vec4 arguments are taken, in order, from
+// bits 16-23, bits 8-15, bits 0-7 and bits 24-31 of each pixel.
+SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ U32 pixels = {a, b, c, d};
+ return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF),
+ cast(pixels & 0xFF), cast(pixels >> 24)) *
+ (1.0f / 255.0f);
+}
+
+// Transposes four per-pixel Floats (a..d, each holding x/y/z/w) into a vec4
+// whose first argument gathers all x values, the second all y values, and so
+// on.
+SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) {
+ return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y},
+ Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w});
+}
+
+// Integer counterpart of the float transpose: regroups four per-pixel I32
+// values (a..d) so that each ivec4 argument gathers one component from all
+// four pixels.
+SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) {
+ return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y},
+ I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w});
+}
+
+// Unpacks a single 32-bit pixel into a vec4_scalar scaled by 1/255. The
+// components come, in order, from bits 16-23, 8-15, 0-7 and 24-31.
+SI vec4_scalar pixel_to_vec4(uint32_t p) {
+ U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24};
+ Float f = cast(i) * (1.0f / 255.0f);
+ return vec4_scalar(f.x, f.y, f.z, f.w);
+}
+
+// Gathers four 32-bit pixels from sampler->buf, one per lane of offset, and
+// unpacks them with pixel_to_vec4. No bounds checking is done here.
+template <typename S>
+SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) {
+ return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y],
+ sampler->buf[offset.z], sampler->buf[offset.w]);
+}
+
+// Fetches RGBA8 texels at P, computing each lane's buffer offset as
+// x + y * stride. P is not clamped here.
+template <typename S>
+vec4 texelFetchRGBA8(S sampler, ivec2 P) {
+ I32 offset = P.x + P.y * sampler->stride;
+ return fetchOffsetsRGBA8(sampler, offset);
+}
+
+// Array-texture variant: asserts that all lanes address the same layer, then
+// adds that layer's offset (P.z.x * height_stride) to x + y * stride.
+vec4 texelFetchRGBA8(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return fetchOffsetsRGBA8(sampler, offset);
+}
+
+// Gathers four single-byte texels, indexing sampler->buf as a byte array with
+// each lane of offset, and scales them by 1/255. No bounds checking is done.
+template <typename S>
+SI Float fetchOffsetsR8(S sampler, I32 offset) {
+ U32 i = {
+ ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y],
+ ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]};
+ return cast(i) * (1.0f / 255.0f);
+}
+
+// Fetches R8 texels at P (offset x + y * stride) and returns them as
+// (r, 0, 0, 1). P is not clamped here.
+template <typename S>
+vec4 texelFetchR8(S sampler, ivec2 P) {
+ I32 offset = P.x + P.y * sampler->stride;
+ return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);
+}
+
+// Array-texture variant: asserts that all lanes address the same layer, adds
+// that layer's offset (P.z.x * height_stride), and returns (r, 0, 0, 1).
+vec4 texelFetchR8(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);
+}
+
+// Gathers four 16-bit texels, indexing sampler->buf as a uint16_t array. The
+// low byte becomes r and the high byte becomes g, both scaled by 1/255; the
+// result is (r, g, 0, 1).
+template <typename S>
+SI vec4 fetchOffsetsRG8(S sampler, I32 offset) {
+ uint16_t* buf = (uint16_t*)sampler->buf;
+ U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]};
+ Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f);
+ Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f);
+ return vec4(r, g, 0.0f, 1.0f);
+}
+
+// Fetches RG8 texels at P, computing each lane's offset as x + y * stride.
+// P is not clamped here.
+template <typename S>
+vec4 texelFetchRG8(S sampler, ivec2 P) {
+ I32 offset = P.x + P.y * sampler->stride;
+ return fetchOffsetsRG8(sampler, offset);
+}
+
+// Array-texture variant: asserts that all lanes address the same layer, then
+// adds that layer's offset (P.z.x * height_stride) to x + y * stride.
+vec4 texelFetchRG8(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return fetchOffsetsRG8(sampler, offset);
+}
+
+// Gathers four 16-bit texels, indexing sampler->buf as a uint16_t array, and
+// scales them by 1/65535. No bounds checking is done.
+template <typename S>
+SI Float fetchOffsetsR16(S sampler, I32 offset) {
+ U32 i = {
+ ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y],
+ ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]};
+ return cast(i) * (1.0f / 65535.0f);
+}
+
+// Fetches R16 texels at P (offset x + y * stride) and returns them as
+// (r, 0, 0, 1). P is not clamped here.
+template <typename S>
+vec4 texelFetchR16(S sampler, ivec2 P) {
+ I32 offset = P.x + P.y * sampler->stride;
+ return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f);
+}
+
+// Array-texture variant: asserts that all lanes address the same layer, adds
+// that layer's offset (P.z.x * height_stride), and returns (r, 0, 0, 1).
+vec4 texelFetchR16(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f);
+}
+
+// For each lane of offset, reinterprets the memory starting at
+// sampler->buf[offset] as a four-component Float, then transposes the four
+// texels into a component-wise vec4. No bounds checking is done.
+template <typename S>
+SI vec4 fetchOffsetsFloat(S sampler, I32 offset) {
+ return pixel_float_to_vec4(
+ *(Float*)&sampler->buf[offset.x], *(Float*)&sampler->buf[offset.y],
+ *(Float*)&sampler->buf[offset.z], *(Float*)&sampler->buf[offset.w]);
+}
+
+// Fetches float texels at P. The x coordinate is scaled by 4 because each
+// texel spans four buf elements (presumably 32-bit words; confirm against the
+// sampler definition). P is not clamped here.
+vec4 texelFetchFloat(sampler2D sampler, ivec2 P) {
+ I32 offset = P.x * 4 + P.y * sampler->stride;
+ return fetchOffsetsFloat(sampler, offset);
+}
+
+// Array-texture variant: asserts that all lanes address the same layer, then
+// adds that layer's offset (P.z.x * height_stride) to x * 4 + y * stride.
+SI vec4 texelFetchFloat(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x * 4 + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return fetchOffsetsFloat(sampler, offset);
+}
+
+// Gathers four texels from a packed two-pixels-per-chunk buffer and returns
+// them as (r, g, b, 1), each channel scaled by 1/255.
+template <typename S>
+SI vec4 fetchOffsetsYUV422(S sampler, I32 offset) {
+ // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.
+ // Offset is aligned to a chunk rather than a pixel, and selector specifies
+ // pixel within the chunk.
+ I32 selector = offset & 1;
+ offset &= ~1;
+ uint16_t* buf = (uint16_t*)sampler->buf;
+ U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y],
+ *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]};
+ Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f);
+ Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f);
+ // -selector is all ones for the second pixel of a chunk (pick byte 2, G1)
+ // and zero for the first (pick byte 0, G0).
+ Float g =
+ CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) *
+ (1.0f / 255.0f);
+ return vec4(r, g, b, 1.0f);
+}
+
+// Fetches YUV422 texels at P, computing each lane's offset as x + y * stride.
+// P is not clamped here.
+template <typename S>
+vec4 texelFetchYUV422(S sampler, ivec2 P) {
+ I32 offset = P.x + P.y * sampler->stride;
+ return fetchOffsetsYUV422(sampler, offset);
+}
+
+// Generic 2D texel fetch: clamps P to the texture bounds and dispatches on
+// sampler->format. Only lod 0 is supported (asserted). An unhandled format
+// asserts and yields a default-constructed vec4.
+vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ switch (sampler->format) {
+ case TextureFormat::RGBA32F:
+ return texelFetchFloat(sampler, P);
+ case TextureFormat::RGBA8:
+ return texelFetchRGBA8(sampler, P);
+ case TextureFormat::R8:
+ return texelFetchR8(sampler, P);
+ case TextureFormat::RG8:
+ return texelFetchRG8(sampler, P);
+ case TextureFormat::R16:
+ return texelFetchR16(sampler, P);
+ case TextureFormat::YUV422:
+ return texelFetchYUV422(sampler, P);
+ default:
+ assert(false);
+ return vec4();
+ }
+}
+
+// Format-specific fetch: clamps P, asserts the sampler really is RGBA32F and
+// that lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return texelFetchFloat(sampler, P);
+}
+
+// Format-specific fetch: clamps P, asserts the sampler really is RGBA8 and
+// that lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA8);
+ return texelFetchRGBA8(sampler, P);
+}
+
+// Format-specific fetch: clamps P, asserts the sampler really is R8 and that
+// lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::R8);
+ return texelFetchR8(sampler, P);
+}
+
+// Format-specific fetch: clamps P, asserts the sampler really is RG8 and that
+// lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RG8);
+ return texelFetchRG8(sampler, P);
+}
+
+// Scalar (single-texel) fetch. Handles RGBA32F by reinterpreting the buffer
+// in place; every other format is asserted to be RGBA8 and unpacked from one
+// 32-bit pixel. Only lod 0 is supported (asserted).
+vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ if (sampler->format == TextureFormat::RGBA32F) {
+ return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+ } else {
+ assert(sampler->format == TextureFormat::RGBA8);
+ return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
+ }
+}
+
+// Scalar RGBA32F fetch: clamps P, asserts format and lod, and reinterprets
+// the buffer at x * 4 + y * stride as a vec4_scalar.
+vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
+
+// Scalar RGBA8 fetch: clamps P, asserts format and lod, and unpacks the
+// single 32-bit pixel at x + y * stride.
+vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA8);
+ return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
+}
+
+// Scalar R8 fetch: clamps P, asserts format and lod, reads one byte at
+// x + y * stride and returns (r, 0, 0, 1) with r scaled by 1/255.
+vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::R8);
+ return vec4_scalar{
+ to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f,
+ 0.0f, 1.0f};
+}
+
+// Scalar RG8 fetch: clamps P, asserts format and lod, reads one 16-bit pixel
+// at x + y * stride and returns (low byte, high byte, 0, 1) scaled by 1/255.
+vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RG8);
+ uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride];
+ return vec4_scalar{to_float(pixel & 0xFF), to_float(pixel >> 8), 0.0f, 1.0f};
+}
+
+// Rectangle-texture fetch (no lod parameter): clamps P and dispatches on
+// sampler->format. RGBA32F is not handled here; an unhandled format asserts
+// and yields a default-constructed vec4.
+vec4 texelFetch(sampler2DRect sampler, ivec2 P) {
+ P = clamp2D(P, sampler);
+ switch (sampler->format) {
+ case TextureFormat::RGBA8:
+ return texelFetchRGBA8(sampler, P);
+ case TextureFormat::R8:
+ return texelFetchR8(sampler, P);
+ case TextureFormat::RG8:
+ return texelFetchRG8(sampler, P);
+ case TextureFormat::R16:
+ return texelFetchR16(sampler, P);
+ case TextureFormat::YUV422:
+ return texelFetchYUV422(sampler, P);
+ default:
+ assert(false);
+ return vec4();
+ }
+}
+
+// Generic array-texture fetch: clamps P (including the layer) and dispatches
+// on sampler->format. YUV422 is not handled here; an unhandled format asserts
+// and yields a default-constructed vec4. Only lod 0 is supported (asserted).
+SI vec4 texelFetch(sampler2DArray sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ switch (sampler->format) {
+ case TextureFormat::RGBA32F:
+ return texelFetchFloat(sampler, P);
+ case TextureFormat::RGBA8:
+ return texelFetchRGBA8(sampler, P);
+ case TextureFormat::R8:
+ return texelFetchR8(sampler, P);
+ case TextureFormat::RG8:
+ return texelFetchRG8(sampler, P);
+ case TextureFormat::R16:
+ return texelFetchR16(sampler, P);
+ default:
+ assert(false);
+ return vec4();
+ }
+}
+
+// Format-specific array fetch: clamps P, asserts the sampler really is
+// RGBA32F and that lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DArrayRGBA32F sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return texelFetchFloat(sampler, P);
+}
+
+// Format-specific array fetch: clamps P, asserts the sampler really is RGBA8
+// and that lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DArrayRGBA8 sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA8);
+ return texelFetchRGBA8(sampler, P);
+}
+
+// Format-specific array fetch: clamps P, asserts the sampler really is R8 and
+// that lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DArrayR8 sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ assert(sampler->format == TextureFormat::R8);
+ return texelFetchR8(sampler, P);
+}
+
+// Format-specific array fetch: clamps P, asserts the sampler really is RG8
+// and that lod is 0, then fetches without a format switch.
+vec4 texelFetch(sampler2DArrayRG8 sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ assert(sampler->format == TextureFormat::RG8);
+ return texelFetchRG8(sampler, P);
+}
+
+// For each lane of offset, reinterprets the memory starting at
+// sampler->buf[offset] as a four-component I32, then regroups the four texels
+// into a component-wise ivec4. No bounds checking is done.
+template <typename S>
+SI ivec4 fetchOffsetsInt(S sampler, I32 offset) {
+ return pixel_int_to_ivec4(
+ *(I32*)&sampler->buf[offset.x], *(I32*)&sampler->buf[offset.y],
+ *(I32*)&sampler->buf[offset.z], *(I32*)&sampler->buf[offset.w]);
+}
+
+// Integer-texture fetch: clamps P, asserts the sampler is RGBA32I and lod is
+// 0, and fetches from offset x * 4 + y * stride.
+ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32I);
+ I32 offset = P.x * 4 + P.y * sampler->stride;
+ return fetchOffsetsInt(sampler, offset);
+}
+
+// Scalar integer-texture fetch: clamps P, asserts format and lod, and
+// reinterprets the buffer at x * 4 + y * stride as an ivec4_scalar.
+ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32I);
+ return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
+
+// Returns a pointer to the RGBA32F texel at P, after clamping P so that every
+// relative offset in [min_x, max_x] x [min_y, max_y] from it still lands
+// inside the texture: x is clamped to [-min_x, width - 1 - max_x], and y
+// likewise with height.
+SI vec4_scalar* texelFetchPtr(sampler2D sampler, ivec2_scalar P, int min_x,
+ int max_x, int min_y, int max_y) {
+ P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x);
+ P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return (vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
+
+// Returns a pointer to the RGBA32I texel at P, after clamping P so that every
+// relative offset in [min_x, max_x] x [min_y, max_y] from it still lands
+// inside the texture: x is clamped to [-min_x, width - 1 - max_x], and y
+// likewise with height.
+SI ivec4_scalar* texelFetchPtr(isampler2D sampler, ivec2_scalar P, int min_x,
+ int max_x, int min_y, int max_y) {
+ P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x);
+ P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y);
+ assert(sampler->format == TextureFormat::RGBA32I);
+ return (ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
+
+// Vector variant: instead of a pointer it returns per-lane buffer offsets
+// (x * 4 + y * stride). Each lane of P is clamped to
+// [-min_x, width - max_x - 1] and [-min_y, height - max_y - 1] (clampCoord
+// subtracts 1 from its limit), matching the scalar overloads.
+template <typename S>
+SI I32 texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x, int min_y,
+ int max_y) {
+ P.x = clampCoord(P.x, int(sampler->width) - max_x, -min_x);
+ P.y = clampCoord(P.y, int(sampler->height) - max_y, -min_y);
+ return P.x * 4 + P.y * sampler->stride;
+}
+
+// Reads the texel (x, y) texels away from `ptr` without any bounds checking.
+// The row pitch in units of P is taken to be sampler->stride >> 2.
+template <typename S, typename P>
+SI P texelFetchUnchecked(S sampler, P* ptr, int x, int y = 0) {
+  const auto rowPitch = sampler->stride >> 2;
+  return ptr[x + y * rowPitch];
+}
+
+// Gathers RGBA32F texels displaced by (x, y) texels from the per-lane element
+// offsets in `offset`, without any bounds checking.
+SI vec4 texelFetchUnchecked(sampler2D sampler, I32 offset, int x, int y = 0) {
+  assert(sampler->format == TextureFormat::RGBA32F);
+  const auto displacement = x * 4 + y * sampler->stride;
+  return fetchOffsetsFloat(sampler, offset + displacement);
+}
+
+// Gathers RGBA32I texels displaced by (x, y) texels from the per-lane element
+// offsets in `offset`, without any bounds checking.
+SI ivec4 texelFetchUnchecked(isampler2D sampler, I32 offset, int x, int y = 0) {
+  assert(sampler->format == TextureFormat::RGBA32I);
+  const auto displacement = x * 4 + y * sampler->stride;
+  return fetchOffsetsInt(sampler, offset + displacement);
+}
+
+// GLSL texelFetchOffset: forwards to texelFetch with the offset added to the
+// coordinate. Being a macro, it works with every texelFetch overload.
+#define texelFetchOffset(sampler, P, lod, offset) \
+ texelFetch(sampler, (P) + (offset), lod)
+
+// Converts texel-space coordinates into fixed-point units of 1/scale texels
+// for linear filtering. The coordinates are assumed to be offset to texel
+// centers already, so half a texel is subtracted (in scaled units) and half a
+// unit is added so that a later truncation rounds to the nearest 1/scale
+// increment.
+template <typename T>
+SI T linearQuantize(T P, float scale) {
+  const float bias = 0.5f - 0.5f * scale;
+  return P * scale + bias;
+}
+
+// Scales normalized texture coordinates up to texel units for the given
+// sampler by multiplying x by the texture width and y by the texture height.
+// Used by the sampler-taking overload of linearQuantize.
+template <typename T, typename S>
+SI T samplerScale(S sampler, T P) {
+ P.x *= sampler->width;
+ P.y *= sampler->height;
+ return P;
+}
+
+// sampler2DRect overload: the coordinates are returned unchanged, i.e. no
+// width/height scaling is applied for rectangle samplers.
+template <typename T>
+SI T samplerScale(sampler2DRect sampler, T P) {
+ return P;
+}
+
+// Convenience overload: first brings P into texel units for `sampler` with
+// samplerScale, then quantizes it to 1/scale increments.
+template <typename T, typename S>
+SI T linearQuantize(T P, float scale, S sampler) {
+  T texelCoords = samplerScale(sampler, P);
+  return linearQuantize(texelCoords, scale);
+}
+
+// Compute clamped offset of first row for linear interpolation.
+// The X coordinate is clamped via clampCoord against (width - margin) and the
+// Y coordinate against height; the result is x + y * stride + zoffset, with
+// the same type as i.x (so it works for both scalar and vector coordinates).
+// `margin` defaults to 1; the YUV422 path in this file passes 2.
+template <typename S, typename I>
+SI auto computeRow(S sampler, I i, int32_t zoffset, size_t margin = 1)
+ -> decltype(i.x) {
+ return clampCoord(i.x, sampler->width - margin) +
+ clampCoord(i.y, sampler->height) * sampler->stride + zoffset;
+}
+
+// Compute clamped offset of second row for linear interpolation from first row.
+// The range test (0 <= i.y < height - 1) is ANDed with the stride, so lanes
+// that have a following row get `stride` and every other lane gets 0, making
+// the second row alias the first there. This relies on true comparison lanes
+// being all-ones, as with clang vector types.
+template <typename S>
+SI I32 computeNextRowOffset(S sampler, ivec2 i) {
+ return (i.y >= 0 && i.y < int32_t(sampler->height) - 1) &
+ I32(sampler->stride);
+}
+
+// Convert X coordinate to a 2^7 scale fraction for interpolation.
+// `i` is the integer texel coordinate and `frac` the unshifted fixed-point
+// coordinate; only its low 7 bits are used.
+// - Lanes with i.x < 0 get a fraction of 0.
+// - Lanes with i.x > width - 2 are flagged in `overread`. Assuming true
+//   comparison lanes are all-ones (-1), these lanes evaluate to
+//   0x7F - (-1) = 128, i.e. full weight on the second sample.
+template <typename S>
+SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) {
+ auto overread = i.x > int32_t(sampler->width) - 2;
+ return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16);
+}
+
+// Convert Y coordinate to a 2^7 scale fraction for interpolation.
+// Simply keeps the low 7 bits of frac.y; no edge handling is done here. In
+// the callers in this file, rows at the edge are handled by
+// computeNextRowOffset returning 0 instead.
+SI I16 computeFracY(ivec2 frac) { return CONVERT(frac.y & 0x7F, I16); }
+
+// Result of textureLinearPlanarRGBA8: four filtered RGBA8 samples stored
+// planar (one channel after another) with 16 bits per channel value.
+struct WidePlanarRGBA8 {
+ // First channel for the 4 samples in the low half, second channel in the
+ // high half (textureLinearRGBA8 splits it this way).
+ V8<uint16_t> rg;
+ // Third channel in the low half, fourth channel in the high half.
+ V8<uint16_t> ba;
+};
+
+// Bilinearly filters four RGBA8 samples at the fixed-point coordinates `i`
+// (7 fractional bits, as produced by linearQuantize(P, 128, ...)) and returns
+// them in planar form. `zoffset` is added to every row offset and is used by
+// callers to select an array layer.
+template <typename S>
+SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i,
+ int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::RGBA8);
+
+ // Keep the unshifted value for the fractions, then reduce i to whole texels.
+ ivec2 frac = i;
+ i >>= 7;
+
+ I32 row0 = computeRow(sampler, i, zoffset);
+ I32 row1 = row0 + computeNextRowOffset(sampler, i);
+ I16 fracx = computeFracX(sampler, i, frac);
+ I16 fracy = computeFracY(frac);
+
+ // For each of the four lanes (a, b, c, d): load 8 bytes (two horizontally
+ // adjacent RGBA8 texels) from both rows, widen to 16 bits and blend the rows
+ // by that lane's vertical fraction.
+ auto a0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
+ auto a1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);
+ a0 += ((a1 - a0) * fracy.x) >> 7;
+
+ auto b0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);
+ auto b1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);
+ b0 += ((b1 - b0) * fracy.y) >> 7;
+
+ // Interleave a and b so the left texels end up in abl and the right texels
+ // in abh, alternating a/b per channel; hence the xyxyxyxy fraction pattern.
+ auto abl = zipLow(a0, b0);
+ auto abh = zipHigh(a0, b0);
+ abl += ((abh - abl) * fracx.xyxyxyxy) >> 7;
+
+ auto c0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);
+ auto c1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);
+ c0 += ((c1 - c0) * fracy.z) >> 7;
+
+ auto d0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);
+ auto d1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);
+ d0 += ((d1 - d0) * fracy.w) >> 7;
+
+ // Same for lanes c and d.
+ auto cdl = zipLow(c0, d0);
+ auto cdh = zipHigh(c0, d0);
+ cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7;
+
+ // Merge the ab and cd pairs so each channel is contiguous across the four
+ // lanes (presumably zip2* interleaves in units of two elements).
+ auto rg = V8<uint16_t>(zip2Low(abl, cdl));
+ auto ba = V8<uint16_t>(zip2High(abl, cdl));
+ return WidePlanarRGBA8{rg, ba};
+}
+
+// Linear-filtered sample of an RGBA8 texture as a float vec4 in [0, 1].
+// The first and third stored channels are swapped when the result is built.
+template <typename S>
+vec4 textureLinearRGBA8(S sampler, vec2 P, int32_t zoffset = 0) {
+  ivec2 fixedCoord(linearQuantize(P, 128, sampler));
+  auto planar = textureLinearPlanarRGBA8(sampler, fixedCoord, zoffset);
+  auto firstPair = CONVERT(planar.rg, V8<float>);
+  auto secondPair = CONVERT(planar.ba, V8<float>);
+  // Channel order: (third, second, first, fourth) of the stored channels.
+  return vec4(lowHalf(secondPair), highHalf(firstPair), lowHalf(firstPair),
+              highHalf(secondPair)) *
+         (1.0f / 255.0f);
+}
+
+// Bilinearly filters four R8 samples at the fixed-point coordinates `i`
+// (7 fractional bits) and returns them as four 16-bit values, one per lane.
+// `zoffset` is added to every row offset.
+template <typename S>
+static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i,
+ int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::R8);
+ // Keep the unshifted value for the fractions, then reduce i to whole texels.
+ ivec2 frac = i;
+ i >>= 7;
+
+ I32 row0 = computeRow(sampler, i, zoffset);
+ I32 row1 = row0 + computeNextRowOffset(sampler, i);
+ I16 fracx = computeFracX(sampler, i, frac);
+ I16 fracy = computeFracY(frac);
+
+ // Row offsets are used as byte indices here. Each load reads two
+ // horizontally adjacent texels for one lane.
+ uint8_t* buf = (uint8_t*)sampler->buf;
+ auto a0 = unaligned_load<V2<uint8_t>>(&buf[row0.x]);
+ auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]);
+ auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]);
+ auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]);
+ auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>);
+
+ auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]);
+ auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]);
+ auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]);
+ auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]);
+ auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>);
+
+ // Blend the two rows; each lane's fraction applies to both of its texels.
+ abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7;
+
+ // De-interleave: left texels of all lanes to the low half, right texels to
+ // the high half, then blend horizontally.
+ abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
+ auto abcdl = lowHalf(abcd0);
+ auto abcdh = highHalf(abcd0);
+ abcdl += ((abcdh - abcdl) * fracx) >> 7;
+
+ return U16(abcdl);
+}
+
+// Linear-filtered sample of an R8 texture. The filtered value is normalized
+// to [0, 1] and returned in the first component; the rest is (0, 0, 1).
+template <typename S>
+vec4 textureLinearR8(S sampler, vec2 P, int32_t zoffset = 0) {
+  assert(sampler->format == TextureFormat::R8);
+
+  ivec2 fixedCoord(linearQuantize(P, 128, sampler));
+  auto filtered = textureLinearUnpackedR8(sampler, fixedCoord, zoffset);
+  Float red = CONVERT(filtered, Float);
+  return vec4(red * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f);
+}
+
+// Result of textureLinearPlanarRG8: four filtered RG8 samples in planar form.
+struct WidePlanarRG8 {
+ // First channel for the 4 samples in the low half, second channel in the
+ // high half (textureLinearRG8 splits it this way).
+ V8<uint16_t> rg;
+};
+
+// Bilinearly filters four RG8 samples at the fixed-point coordinates `i`
+// (7 fractional bits) and returns them in planar form: all first-channel
+// values, then all second-channel values. `zoffset` is added to every row
+// offset.
+template <typename S>
+SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i,
+ int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::RG8);
+
+ // Keep the unshifted value for the fractions, then reduce i to whole texels.
+ ivec2 frac = i;
+ i >>= 7;
+
+ I32 row0 = computeRow(sampler, i, zoffset);
+ I32 row1 = row0 + computeNextRowOffset(sampler, i);
+ I16 fracx = computeFracX(sampler, i, frac);
+ I16 fracy = computeFracY(frac);
+
+ // Row offsets index 16-bit (two channel) texels.
+ uint16_t* buf = (uint16_t*)sampler->buf;
+
+ // Load RG bytes for two adjacent pixels - rgRG
+ auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
+ auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);
+ auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
+ // Load two pixels for next row
+ auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
+ auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
+ auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
+ // Blend rows
+ ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;
+
+ auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
+ auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
+ auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
+ auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
+ auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
+ auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
+ // Blend rows
+ cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;
+
+ // ab = a.rgRG,b.rgRG
+ // cd = c.rgRG,d.rgRG
+ // ... ac = ar,cr,ag,cg,aR,cR,aG,cG
+ // ... bd = br,dr,bg,dg,bR,dR,bG,dG
+ auto ac = zipLow(ab0, cd0);
+ auto bd = zipHigh(ab0, cd0);
+ // ar,br,cr,dr,ag,bg,cg,dg
+ // aR,bR,cR,dR,aG,bG,cG,dG
+ auto abcdl = zipLow(ac, bd);
+ auto abcdh = zipHigh(ac, bd);
+ // Blend columns
+ abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7;
+
+ auto rg = V8<uint16_t>(abcdl);
+ return WidePlanarRG8{rg};
+}
+
+// Linear-filtered sample of an RG8 texture. The two filtered channels are
+// normalized to [0, 1]; the remaining components are (0, 1).
+template <typename S>
+vec4 textureLinearRG8(S sampler, vec2 P, int32_t zoffset = 0) {
+  ivec2 fixedCoord(linearQuantize(P, 128, sampler));
+  auto planar = textureLinearPlanarRG8(sampler, fixedCoord, zoffset);
+  auto normalized = CONVERT(planar.rg, V8<float>) * (1.0f / 255.0f);
+  auto red = lowHalf(normalized);
+  auto green = highHalf(normalized);
+  return vec4(red, green, 0.0f, 1.0f);
+}
+
+// Samples R16 texture with linear filtering and returns results packed as
+// signed I16. One bit of precision is shifted away from the bottom end to
+// accommodate the sign bit, so only 15 bits of precision is left.
+// `i` holds fixed-point coordinates with 7 fractional bits; `zoffset` is added
+// to every row offset.
+template <typename S>
+static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i,
+ int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::R16);
+
+ // Keep the unshifted value for the fractions, then reduce i to whole texels.
+ ivec2 frac = i;
+ i >>= 7;
+
+ I32 row0 = computeRow(sampler, i, zoffset);
+ I32 row1 = row0 + computeNextRowOffset(sampler, i);
+
+ // The X fraction is computed inline rather than with computeFracX: the
+ // over-read lanes saturate at 0x7F here (computeFracX would yield 128 under
+ // all-ones comparison masks, which would turn negative once shifted into
+ // the top of an int16). Both fractions are shifted up by 8 so the 7-bit
+ // value occupies the high bits of the 15-bit range.
+ I16 fracx =
+ CONVERT(
+ ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F,
+ I16)
+ << 8;
+ I16 fracy = computeFracY(frac) << 8;
+
+ // Sample the 16 bit data for both rows
+ // (two horizontally adjacent texels per lane, halved to fit in 15 bits).
+ uint16_t* buf = (uint16_t*)sampler->buf;
+ auto a0 = unaligned_load<V2<uint16_t>>(&buf[row0.x]);
+ auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]);
+ auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]);
+ auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]);
+ auto abcd0 =
+ CONVERT(combine(combine(a0, b0), combine(c0, d0)) >> 1, V8<int16_t>);
+
+ auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]);
+ auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]);
+ auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]);
+ auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]);
+ auto abcd1 =
+ CONVERT(combine(combine(a1, b1), combine(c1, d1)) >> 1, V8<int16_t>);
+
+ // The samples occupy 15 bits and the fraction occupies 15 bits, so that when
+ // they are multiplied together, the new scaled sample will fit in the high
+ // 14 bits of the result. It is left shifted once to make it 15 bits again
+ // for the final multiply.
+#if USE_SSE2
+ abcd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww))
+ << 1;
+#elif USE_NEON
+ // NEON has a convenient instruction that does both the multiply and the
+ // doubling, so doesn't need an extra shift.
+ abcd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww));
+#else
+ // Portable fallback: widen to 32 bits, multiply and keep the high 16 bits.
+ abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8<int32_t>) *
+ CONVERT(fracy.xxyyzzww, V8<int32_t>)) >>
+ 16,
+ V8<int16_t>)
+ << 1;
+#endif
+
+ // De-interleave: left texels of all lanes to the low half, right texels to
+ // the high half, then blend horizontally using the same scheme as above.
+ abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
+ auto abcdl = lowHalf(abcd0);
+ auto abcdh = highHalf(abcd0);
+#if USE_SSE2
+ abcdl += lowHalf(bit_cast<V8<int16_t>>(
+ _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx))))
+ << 1;
+#elif USE_NEON
+ abcdl += bit_cast<V4<int16_t>>(vqrdmulh_s16(abcdh - abcdl, fracx));
+#else
+ abcdl += CONVERT((CONVERT(abcdh - abcdl, V4<int32_t>) *
+ CONVERT(fracx, V4<int32_t>)) >>
+ 16,
+ V4<int16_t>)
+ << 1;
+#endif
+
+ return abcdl;
+}
+
+// Linear-filtered sample of an R16 texture. The 15-bit filtered value is
+// normalized by 1/32767 and returned in the first component; the rest is
+// (0, 0, 1).
+template <typename S>
+vec4 textureLinearR16(S sampler, vec2 P, int32_t zoffset = 0) {
+  assert(sampler->format == TextureFormat::R16);
+
+  ivec2 fixedCoord(linearQuantize(P, 128, sampler));
+  auto filtered = textureLinearUnpackedR16(sampler, fixedCoord, zoffset);
+  Float red = CONVERT(filtered, Float);
+  return vec4(red * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f);
+}
+
+// Bilinear filtering for RGBA32F textures, done entirely in floating point
+// (no fixed-point quantization). `zoffset` is added to the texel offsets.
+template <typename S>
+vec4 textureLinearRGBA32F(S sampler, vec2 P, int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::RGBA32F);
+ // Move to texel units and shift by half a texel so that floor() yields the
+ // top-left texel of the 2x2 footprint and r the blend weights.
+ P = samplerScale(sampler, P);
+ P -= 0.5f;
+ vec2 f = floor(P);
+ vec2 r = P - f;
+ ivec2 i(f);
+ // X is clamped against width - 1 so that the texel at +4 elements (the next
+ // texel) can always be read.
+ ivec2 c(clampCoord(i.x, sampler->width - 1),
+ clampCoord(i.y, sampler->height));
+ // Pin the horizontal weight at the edges: 0 left of the texture, 1 at or
+ // beyond the last column.
+ r.x = if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0),
+ 0.0f);
+ // Each texel occupies 4 buffer elements. The second row offset falls back
+ // to the first row at the vertical edges (see computeNextRowOffset).
+ I32 offset0 = c.x * 4 + c.y * sampler->stride + zoffset;
+ I32 offset1 = offset0 + computeNextRowOffset(sampler, i);
+
+ // One filtered RGBA texel per lane: blend horizontally within each row,
+ // then vertically between the rows.
+ Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x],
+ *(Float*)&sampler->buf[offset0.x + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.x],
+ *(Float*)&sampler->buf[offset1.x + 4], r.x),
+ r.y);
+ Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y],
+ *(Float*)&sampler->buf[offset0.y + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.y],
+ *(Float*)&sampler->buf[offset1.y + 4], r.x),
+ r.y);
+ Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z],
+ *(Float*)&sampler->buf[offset0.z + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.z],
+ *(Float*)&sampler->buf[offset1.z + 4], r.x),
+ r.y);
+ Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w],
+ *(Float*)&sampler->buf[offset0.w + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.w],
+ *(Float*)&sampler->buf[offset1.w + 4], r.x),
+ r.y);
+ return pixel_float_to_vec4(c0, c1, c2, c3);
+}
+
+// Result of textureLinearPlanarYUV422: one 16-bit value per lane for each of
+// the three planes.
+struct WidePlanarYUV8 {
+ U16 y;
+ U16 u;
+ U16 v;
+};
+
+// Bilinearly filters four samples from a packed YUV422 texture at the
+// fixed-point coordinates `i` (7 fractional bits) and returns the three
+// components in planar form. `zoffset` is added to every row offset.
+template <typename S>
+SI WidePlanarYUV8 textureLinearPlanarYUV422(S sampler, ivec2 i,
+ int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::YUV422);
+
+ // Keep the unshifted value for the fractions, then reduce i to whole texels.
+ ivec2 frac = i;
+ i >>= 7;
+
+ // A margin of 2 keeps a whole following chunk readable.
+ I32 row0 = computeRow(sampler, i, zoffset, 2);
+ // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.
+ // Get the selector for the pixel within the chunk.
+ I32 selector = row0 & 1;
+ // Align the row index to the chunk.
+ row0 &= ~1;
+ I32 row1 = row0 + computeNextRowOffset(sampler, i);
+ // G only needs to be clamped to a pixel boundary for safe interpolation,
+ // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk
+ // boundary.
+ frac.x &= (i.x >= 0);
+ // Low half: per-pixel fraction for G. High half: halved fraction for B/R,
+ // which advance once per 2-pixel chunk.
+ auto fracx =
+ CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3),
+ (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) &
+ 0x7F,
+ V8<int16_t>);
+ I16 fracy = computeFracY(frac);
+
+ // Row offsets index 16-bit units (one pixel's worth of bytes).
+ uint16_t* buf = (uint16_t*)sampler->buf;
+
+ // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R
+ // We always need to interpolate between (b,r) and (B,R).
+ // Depending on selector we need to either interpolate between g0 and g1
+ // or between g1 and G0. So for now we just interpolate both cases for g
+ // and will select the appropriate one on output.
+ auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.x]), V8<int16_t>);
+ auto a1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.x]), V8<int16_t>);
+ // Combine with next row.
+ a0 += ((a1 - a0) * fracy.x) >> 7;
+
+ auto b0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.y]), V8<int16_t>);
+ auto b1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.y]), V8<int16_t>);
+ b0 += ((b1 - b0) * fracy.y) >> 7;
+
+ auto c0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.z]), V8<int16_t>);
+ auto c1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.z]), V8<int16_t>);
+ c0 += ((c1 - c0) * fracy.z) >> 7;
+
+ auto d0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.w]), V8<int16_t>);
+ auto d1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.w]), V8<int16_t>);
+ d0 += ((d1 - d0) * fracy.w) >> 7;
+
+ // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and
+ // g1,g1,g1,g1,r,r,r,r.
+ auto abl = zipLow(a0, b0);
+ auto cdl = zipLow(c0, d0);
+ auto g0b = zip2Low(abl, cdl);
+ auto g1r = zip2High(abl, cdl);
+
+ // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and
+ // and shifts, just shuffle here instead... We finally end up with
+ // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R.
+ auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15);
+ auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15);
+ auto g1B = zip2Low(abh, cdh);
+ auto G0R = zip2High(abh, cdh);
+
+ // Finally interpolate between adjacent columns.
+ g0b += ((g1B - g0b) * fracx) >> 7;
+ g1r += ((G0R - g1r) * fracx) >> 7;
+
+ // Choose either g0 or g1 based on selector.
+ // (-selector turns the 0/1 selector into a 0/all-ones lane mask.)
+ return WidePlanarYUV8{
+ U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))),
+ U16(highHalf(g0b)), U16(highHalf(g1r))};
+}
+
+// Linear-filtered sample of a packed YUV422 texture. Each plane is normalized
+// to [0, 1] and the result is assembled as (v, y, u, 1).
+template <typename S>
+vec4 textureLinearYUV422(S sampler, vec2 P, int32_t zoffset = 0) {
+  const float normalize = 1.0f / 255.0f;
+  ivec2 fixedCoord(linearQuantize(P, 128, sampler));
+  auto planar = textureLinearPlanarYUV422(sampler, fixedCoord, zoffset);
+  auto luma = CONVERT(planar.y, Float) * normalize;
+  auto chromaU = CONVERT(planar.u, Float) * normalize;
+  auto chromaV = CONVERT(planar.v, Float) * normalize;
+  return vec4(chromaV, luma, chromaU, 1.0f);
+}
+
+// GLSL texture() for 2D samplers with normalized coordinates. LINEAR filtering
+// dispatches on the storage format; any other filter mode falls back to a
+// nearest texel fetch.
+SI vec4 texture(sampler2D sampler, vec2 P) {
+  if (sampler->filter != TextureFilter::LINEAR) {
+    // Nearest: scale to texel units and fetch a single texel.
+    ivec2 coord(roundzero(P.x, sampler->width),
+                roundzero(P.y, sampler->height));
+    return texelFetch(sampler, coord, 0);
+  }
+
+  switch (sampler->format) {
+    case TextureFormat::RGBA32F:
+      return textureLinearRGBA32F(sampler, P);
+    case TextureFormat::RGBA8:
+      return textureLinearRGBA8(sampler, P);
+    case TextureFormat::R8:
+      return textureLinearR8(sampler, P);
+    case TextureFormat::RG8:
+      return textureLinearRG8(sampler, P);
+    case TextureFormat::R16:
+      return textureLinearR16(sampler, P);
+    case TextureFormat::YUV422:
+      return textureLinearYUV422(sampler, P);
+    default:
+      // No linear filter exists for this format.
+      assert(false);
+      return vec4();
+  }
+}
+
+// GLSL texture() for rectangle samplers, whose coordinates are already in
+// texel units. LINEAR filtering dispatches on the storage format; any other
+// filter mode falls back to a nearest texel fetch.
+vec4 texture(sampler2DRect sampler, vec2 P) {
+  if (sampler->filter != TextureFilter::LINEAR) {
+    // Nearest: coordinates need no scaling, hence the factor of 1.
+    ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f));
+    return texelFetch(sampler, coord);
+  }
+
+  switch (sampler->format) {
+    case TextureFormat::RGBA8:
+      return textureLinearRGBA8(sampler, P);
+    case TextureFormat::R8:
+      return textureLinearR8(sampler, P);
+    case TextureFormat::RG8:
+      return textureLinearRG8(sampler, P);
+    case TextureFormat::R16:
+      return textureLinearR16(sampler, P);
+    case TextureFormat::YUV422:
+      return textureLinearYUV422(sampler, P);
+    default:
+      // No linear filter exists for this format.
+      assert(false);
+      return vec4();
+  }
+}
+
+// GLSL texture() for 2D array samplers. P.xy are normalized coordinates and
+// P.z selects the layer. LINEAR filtering dispatches on the storage format;
+// any other filter mode falls back to a nearest texel fetch.
+SI vec4 texture(sampler2DArray sampler, vec3 P) {
+  if (sampler->filter != TextureFilter::LINEAR) {
+    // just do nearest for now
+    ivec3 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height),
+                roundeven(P.z, 1.0f));
+    return texelFetch(sampler, coord, 0);
+  }
+
+  // SSE2 can generate slow code for 32-bit multiply, and we never actually
+  // sample from different layers in one chunk, so do cheaper scalar
+  // multiplication instead.
+  assert(test_all(P.z == P.z.x));
+  int32_t zoffset = clampCoord(roundeven(P.z.x, 1.0f), sampler->depth) *
+                    sampler->height_stride;
+  vec2 uv(P.x, P.y);
+  switch (sampler->format) {
+    case TextureFormat::RGBA32F:
+      return textureLinearRGBA32F(sampler, uv, zoffset);
+    case TextureFormat::RGBA8:
+      return textureLinearRGBA8(sampler, uv, zoffset);
+    case TextureFormat::R8:
+      return textureLinearR8(sampler, uv, zoffset);
+    case TextureFormat::RG8:
+      return textureLinearRG8(sampler, uv, zoffset);
+    case TextureFormat::R16:
+      return textureLinearR16(sampler, uv, zoffset);
+    default:
+      // No linear filter exists for this format.
+      assert(false);
+      return vec4();
+  }
+}
+
+// GLSL texture() with a LOD bias. Only a bias of 0 is supported (asserted);
+// the call is forwarded to the unbiased overload.
+vec4 texture(sampler2DArray sampler, vec3 P, float bias) {
+ assert(bias == 0.0f);
+ return texture(sampler, P);
+}
+
+// GLSL textureLod(). Only level 0 is supported (asserted); the call is
+// forwarded to texture().
+vec4 textureLod(sampler2DArray sampler, vec3 P, float lod) {
+ assert(lod == 0.0f);
+ return texture(sampler, P);
+}
+
+// GLSL textureSize() for array samplers: (width, height, depth). The level
+// argument is ignored.
+ivec3_scalar textureSize(sampler2DArray sampler, int) {
+  const int32_t width = int32_t(sampler->width);
+  const int32_t height = int32_t(sampler->height);
+  const int32_t depth = int32_t(sampler->depth);
+  return ivec3_scalar{width, height, depth};
+}
+
+// GLSL textureSize() for 2D samplers: (width, height). The level argument is
+// ignored.
+ivec2_scalar textureSize(sampler2D sampler, int) {
+  const int32_t width = int32_t(sampler->width);
+  const int32_t height = int32_t(sampler->height);
+  return ivec2_scalar{width, height};
+}
+
+// GLSL textureSize() for rectangle samplers: (width, height).
+ivec2_scalar textureSize(sampler2DRect sampler) {
+  const int32_t width = int32_t(sampler->width);
+  const int32_t height = int32_t(sampler->height);
+  return ivec2_scalar{width, height};
+}
+
+// Bilinearly filters four RGBA8 samples at the fixed-point coordinates `i`
+// (7 fractional bits) and returns them with 16 bits per channel. Unlike
+// textureLinearPlanarRGBA8, the channels of each pixel stay together.
+// `zoffset` is added to every row offset.
+template <typename S>
+static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i,
+ int zoffset = 0) {
+ assert(sampler->format == TextureFormat::RGBA8);
+ // Keep the unshifted value for the fractions, then reduce i to whole texels.
+ ivec2 frac = i;
+ i >>= 7;
+
+ I32 row0 = computeRow(sampler, i, zoffset);
+ I32 row1 = row0 + computeNextRowOffset(sampler, i);
+ I16 fracx = computeFracX(sampler, i, frac);
+ I16 fracy = computeFracY(frac);
+
+ // For each lane: load 8 bytes (two horizontally adjacent texels) from both
+ // rows, widen to 16 bits and blend the rows by the lane's vertical fraction.
+ auto a0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
+ auto a1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);
+ a0 += ((a1 - a0) * fracy.x) >> 7;
+
+ auto b0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);
+ auto b1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);
+ b0 += ((b1 - b0) * fracy.y) >> 7;
+
+ // abl = left texels of a and b, abh = right texels of a and b; blend them
+ // with a's fraction for the first four channels and b's for the last four.
+ auto abl = combine(lowHalf(a0), lowHalf(b0));
+ auto abh = combine(highHalf(a0), highHalf(b0));
+ abl += ((abh - abl) * fracx.xxxxyyyy) >> 7;
+
+ auto c0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);
+ auto c1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);
+ c0 += ((c1 - c0) * fracy.z) >> 7;
+
+ auto d0 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);
+ auto d1 =
+ CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);
+ d0 += ((d1 - d0) * fracy.w) >> 7;
+
+ // Same for lanes c and d.
+ auto cdl = combine(lowHalf(c0), lowHalf(d0));
+ auto cdh = combine(highHalf(c0), highHalf(d0));
+ cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7;
+
+ return combine(HalfRGBA8(abl), HalfRGBA8(cdl));
+}
+
+// Bilinearly filters four RGBA8 samples and packs the result with pack().
+template <typename S>
+static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i,
+                                            int zoffset = 0) {
+  auto wide = textureLinearUnpackedRGBA8(sampler, i, zoffset);
+  return pack(wide);
+}
+
+// Bilinearly filters four R8 samples and packs the result with pack().
+template <typename S>
+static PackedR8 textureLinearPackedR8(S sampler, ivec2 i, int zoffset = 0) {
+  auto wide = textureLinearUnpackedR8(sampler, i, zoffset);
+  return pack(wide);
+}
+
+// Bilinearly filters four RG8 samples at the fixed-point coordinates `i`
+// (7 fractional bits) and returns them with 16 bits per channel, keeping the
+// two channels of each pixel adjacent (unlike textureLinearPlanarRG8).
+// `zoffset` is added to every row offset.
+template <typename S>
+static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i, int zoffset = 0) {
+ assert(sampler->format == TextureFormat::RG8);
+ // The fraction is masked to 7 bits up front here; computeFracX and
+ // computeFracY apply the same mask again.
+ ivec2 frac = i & 0x7F;
+ i >>= 7;
+
+ I32 row0 = computeRow(sampler, i, zoffset);
+ I32 row1 = row0 + computeNextRowOffset(sampler, i);
+ I16 fracx = computeFracX(sampler, i, frac);
+ I16 fracy = computeFracY(frac);
+
+ // Row offsets index 16-bit (two channel) texels.
+ uint16_t* buf = (uint16_t*)sampler->buf;
+
+ // Load RG bytes for two adjacent pixels - rgRG
+ auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
+ auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);
+ auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
+ // Load two pixels for next row
+ auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
+ auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
+ auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
+ // Blend rows
+ ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;
+
+ auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
+ auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
+ auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
+ auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
+ auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
+ auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
+ // Blend rows
+ cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;
+
+ // ab = a.rgRG,b.rgRG
+ // cd = c.rgRG,d.rgRG
+ // ... ac = a.rg,c.rg,a.RG,c.RG
+ // ... bd = b.rg,d.rg,b.RG,d.RG
+ auto ac = zip2Low(ab0, cd0);
+ auto bd = zip2High(ab0, cd0);
+ // a.rg,b.rg,c.rg,d.rg
+ // a.RG,b.RG,c.RG,d.RG
+ auto abcdl = zip2Low(ac, bd);
+ auto abcdh = zip2High(ac, bd);
+ // Blend columns
+ abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7;
+
+ return WideRG8(abcdl);
+}
+
+// Bilinearly filters four RG8 samples and packs the result with pack().
+template <typename S>
+static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i, int zoffset = 0) {
+  auto wide = textureLinearUnpackedRG8(sampler, i, zoffset);
+  return pack(wide);
+}
+
+// Lane-wise saturating add of unsigned 16-bit vectors. If the wrapped sum is
+// smaller than an operand, the lane overflowed; the comparison then yields an
+// all-ones lane (clang vector semantics), and ORing it in forces the result
+// to 0xFFFF.
+template <int N>
+static ALWAYS_INLINE VectorType<uint16_t, N> addsat(VectorType<uint16_t, N> x,
+ VectorType<uint16_t, N> y) {
+ auto r = x + y;
+ return r | (r < x);
+}
+
+// Computes one chunk of 4 horizontally adjacent pixels of a horizontal
+// Gaussian blur pass.
+//
+//   P          pixel storage type (its size determines the channel count)
+//   sampler    source texture
+//   i          coordinate of the first pixel of the chunk
+//   minX/maxX  horizontal sampling range; taps outside of it (or outside of
+//              the texture) are clamped to the nearest pixel inside
+//   radius     number of taps on each side of the center
+//   coeff      weight of the center tap
+//   coeffStep  initial multiplicative step between successive weights; the
+//              step itself is multiplied by coeffStep^2 after every tap
+//   zoffset    offset added to the row offset (e.g. to select a layer)
+//
+// Returns the blurred chunk with 16 bits per channel.
+template <typename P, typename S>
+static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurHorizontal(
+    S sampler, const ivec2_scalar& i, int minX, int maxX, int radius,
+    float coeff, float coeffStep, int zoffset = 0) {
+  // Packed and unpacked vectors for a chunk of the given pixel type.
+  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
+  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
+
+  // Pre-scale the coefficient by 8 bits of fractional precision, so that when
+  // the sample is multiplied by it, it will yield a 16 bit unsigned integer
+  // that will use all 16 bits of precision to accumulate the sum.
+  coeff *= 1 << 8;
+  float coeffStep2 = coeffStep * coeffStep;
+
+  int row = computeRow(sampler, i, zoffset);
+  P* buf = (P*)sampler->buf;
+  auto pixelsRight = unaligned_load<V4<P>>(&buf[row]);
+  auto pixelsLeft = pixelsRight;
+  auto sum = CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) *
+             uint16_t(coeff + 0.5f);
+
+  // Here we use some trickery to reuse the pixels within a chunk, shifted over
+  // by one pixel, to get the next sample for the entire chunk. This allows us
+  // to sample only one pixel for each offset across the entire chunk in both
+  // the left and right directions. To avoid clamping within the loop to the
+  // texture bounds, we compute the valid radius that doesn't require clamping
+  // and fall back to a slower clamping loop outside of that valid radius.
+  int offset = 1;
+  // Number of pixels available to the left of the chunk's first pixel and to
+  // the right of the chunk's last pixel, respectively.
+  int leftBound = i.x - max(minX, 0);
+  int rightBound = min(maxX, sampler->width) - (i.x + 4);
+  int validRadius = min(radius, min(leftBound, rightBound));
+  for (; offset <= validRadius; offset++) {
+    // Overwrite the pixel that needs to be shifted out with the new pixel, and
+    // shift it into the correct location.
+    pixelsRight.x = unaligned_load<P>(&buf[row + offset + 4 - 1]);
+    pixelsRight = pixelsRight.yzwx;
+    pixelsLeft = pixelsLeft.wxyz;
+    pixelsLeft.x = unaligned_load<P>(&buf[row - offset]);
+
+    // Accumulate the Gaussian coefficients step-wise.
+    coeff *= coeffStep;
+    coeffStep *= coeffStep2;
+
+    // Both left and right samples at this offset use the same coefficient.
+    sum = addsat(sum,
+                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
+                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
+                     uint16_t(coeff + 0.5f));
+  }
+
+  for (; offset <= radius; offset++) {
+    // Clamp the tap distance itself, not the chunk-relative index. The new
+    // right-hand pixel lives at row + offset + 3, so once offset exceeds
+    // rightBound it must stick to row + rightBound + 3, the last pixel inside
+    // the range. This mirrors the left side and continues seamlessly from the
+    // unclamped loop above.
+    pixelsRight.x =
+        unaligned_load<P>(&buf[row + min(offset, rightBound) + 4 - 1]);
+    pixelsRight = pixelsRight.yzwx;
+    pixelsLeft = pixelsLeft.wxyz;
+    pixelsLeft.x = unaligned_load<P>(&buf[row - min(offset, leftBound)]);
+
+    coeff *= coeffStep;
+    coeffStep *= coeffStep2;
+
+    sum = addsat(sum,
+                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
+                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
+                     uint16_t(coeff + 0.5f));
+  }
+
+  // Shift away the intermediate precision.
+  return sum >> 8;
+}
+
+// Computes one chunk of 4 horizontally adjacent pixels of a vertical Gaussian
+// blur pass. Parameters mirror gaussianBlurHorizontal, except that minY/maxY
+// bound the rows that may be sampled; rows outside of that range (or outside
+// of the texture) are clamped to the nearest row inside. Returns the blurred
+// chunk with 16 bits per channel.
+template <typename P, typename S>
+static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurVertical(
+ S sampler, const ivec2_scalar& i, int minY, int maxY, int radius,
+ float coeff, float coeffStep, int zoffset = 0) {
+ // Packed and unpacked vectors for a chunk of the given pixel type.
+ typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
+ typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
+
+ // Pre-scale the coefficient by 8 bits of fractional precision, so that when
+ // the sample is multiplied by it, it will yield a 16 bit unsigned integer
+ // that will use all 16 bits of precision to accumulate the sum.
+ coeff *= 1 << 8;
+ float coeffStep2 = coeffStep * coeffStep;
+
+ // Note the naming: "above" walks towards increasing Y (+stride) and "below"
+ // towards decreasing Y (-stride).
+ int rowAbove = computeRow(sampler, i, zoffset);
+ int rowBelow = rowAbove;
+ P* buf = (P*)sampler->buf;
+ auto pixels = unaligned_load<V4<P>>(&buf[rowAbove]);
+ auto sum = CONVERT(bit_cast<packed_type>(pixels), unpacked_type) *
+ uint16_t(coeff + 0.5f);
+
+ // For the vertical loop we can't be quite as creative with reusing old values
+ // as we were in the horizontal loop. We just do the obvious implementation of
+ // loading a chunk from each row in turn and accumulating it into the sum. We
+ // compute a valid radius within which we don't need to clamp the sampled row
+ // and use that to avoid any clamping in the main inner loop. We fall back to
+ // a slower clamping loop outside of that valid radius.
+ int offset = 1;
+ // Number of rows available on the decreasing-Y and increasing-Y side.
+ int belowBound = i.y - max(minY, 0);
+ int aboveBound = min(maxY, sampler->height) - (i.y + 1);
+ int validRadius = min(radius, min(belowBound, aboveBound));
+ for (; offset <= validRadius; offset++) {
+ rowAbove += sampler->stride;
+ rowBelow -= sampler->stride;
+ auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
+ auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);
+
+ // Accumulate the Gaussian coefficients step-wise.
+ coeff *= coeffStep;
+ coeffStep *= coeffStep2;
+
+ // Both above and below samples at this offset use the same coefficient.
+ sum = addsat(sum,
+ (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
+ CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
+ uint16_t(coeff + 0.5f));
+ }
+
+ // Clamping loop: a row offset only advances while it is still within its
+ // bound, so once a side runs out of rows its last row is sampled again.
+ for (; offset <= radius; offset++) {
+ if (offset <= aboveBound) {
+ rowAbove += sampler->stride;
+ }
+ if (offset <= belowBound) {
+ rowBelow -= sampler->stride;
+ }
+ auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
+ auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);
+
+ coeff *= coeffStep;
+ coeffStep *= coeffStep2;
+
+ sum = addsat(sum,
+ (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
+ CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
+ uint16_t(coeff + 0.5f));
+ }
+
+ // Shift away the intermediate precision.
+ return sum >> 8;
+}
+
+} // namespace glsl
diff --git a/gfx/wr/swgl/src/vector_type.h b/gfx/wr/swgl/src/vector_type.h
new file mode 100644
index 0000000000..1d4fc8db1a
--- /dev/null
+++ b/gfx/wr/swgl/src/vector_type.h
@@ -0,0 +1,514 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef __clang__
+# ifdef __SSE2__
+# include <xmmintrin.h>
+# define USE_SSE2 1
+# endif
+# ifdef __ARM_NEON
+# include <arm_neon.h>
+# define USE_NEON 1
+# endif
+#endif
+
+namespace glsl {
+
+#ifdef __clang__
+// Clang path: an N-lane vector of T is defined directly as a Clang
+// ext_vector_type rather than as a wrapper struct (compare the #else branch).
+template <typename T, int N>
+using VectorType = T __attribute__((ext_vector_type(N)));
+
+// CONVERT converts `vector` to the vector type `type`; SHUFFLE builds a vector
+// from lanes of `a` and `b` chosen by the trailing index list. On Clang both
+// forward straight to the corresponding compiler builtins.
+# define CONVERT(vector, type) __builtin_convertvector(vector, type)
+# define SHUFFLE(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__)
+
+// Concatenates two 2-lane vectors into one 4-lane vector: the lanes of `a`
+// (indices 0-1) followed by the lanes of `b` (indices 2-3).
+template <typename T>
+SI VectorType<T, 4> combine(VectorType<T, 2> a, VectorType<T, 2> b) {
+ return __builtin_shufflevector(a, b, 0, 1, 2, 3);
+}
+
+// Concatenates two 4-lane vectors into one 8-lane vector: the lanes of `a`
+// (indices 0-3) followed by the lanes of `b` (indices 4-7).
+template <typename T>
+SI VectorType<T, 8> combine(VectorType<T, 4> a, VectorType<T, 4> b) {
+ return __builtin_shufflevector(a, b, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+// Concatenates two 8-lane vectors into one 16-lane vector: the lanes of `a`
+// (indices 0-7) followed by the lanes of `b` (indices 8-15).
+template <typename T>
+SI VectorType<T, 16> combine(VectorType<T, 8> a, VectorType<T, 8> b) {
+ return __builtin_shufflevector(a, b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15);
+}
+
+// Returns lanes 0-1 of a 4-lane vector as a 2-lane vector.
+template <typename T>
+SI VectorType<T, 2> lowHalf(VectorType<T, 4> a) {
+ return __builtin_shufflevector(a, a, 0, 1);
+}
+
+// Returns lanes 2-3 of a 4-lane vector as a 2-lane vector.
+template <typename T>
+SI VectorType<T, 2> highHalf(VectorType<T, 4> a) {
+ return __builtin_shufflevector(a, a, 2, 3);
+}
+
+// Returns lanes 0-3 of an 8-lane vector as a 4-lane vector.
+template <typename T>
+SI VectorType<T, 4> lowHalf(VectorType<T, 8> a) {
+ return __builtin_shufflevector(a, a, 0, 1, 2, 3);
+}
+
+// Returns lanes 4-7 of an 8-lane vector as a 4-lane vector.
+template <typename T>
+SI VectorType<T, 4> highHalf(VectorType<T, 8> a) {
+ return __builtin_shufflevector(a, a, 4, 5, 6, 7);
+}
+
+// Returns lanes 0-7 of a 16-lane vector as an 8-lane vector.
+template <typename T>
+SI VectorType<T, 8> lowHalf(VectorType<T, 16> a) {
+ return __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+// Returns lanes 8-15 of a 16-lane vector as an 8-lane vector.
+template <typename T>
+SI VectorType<T, 8> highHalf(VectorType<T, 16> a) {
+ return __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+// Widens a 4-lane vector to 8 lanes. Lanes 0-3 are copied from `a`; the upper
+// four lanes use shuffle index -1, which Clang documents as "don't care", so
+// callers must not rely on their contents. (The non-Clang expand() fills the
+// upper half with a second copy of `a` instead.)
+template <typename T>
+SI VectorType<T, 8> expand(VectorType<T, 4> a) {
+ return __builtin_shufflevector(a, a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+#else
+// Maps a lane type T to the scalar type used for shuffle selector lanes
+// (VectorType::mask_index). By default this is T itself; unsigned integer
+// lanes map to the signed integer of the same width, and float maps to int.
+template <typename T>
+struct VectorMask {
+  using type = T;
+};
+
+// Unsigned lanes: use the same-width signed integer.
+template <>
+struct VectorMask<uint32_t> {
+  using type = int32_t;
+};
+template <>
+struct VectorMask<uint16_t> {
+  using type = int16_t;
+};
+template <>
+struct VectorMask<uint8_t> {
+  using type = int8_t;
+};
+
+// Floating-point lanes: use int.
+template <>
+struct VectorMask<float> {
+  using type = int;
+};
+
+// Non-Clang path: wraps a GCC `vector_size` vector in a struct that supplies
+// the constructors, named lanes, conversions, operators and shuffles that the
+// Clang ext_vector_type provides natively.
+template <typename T, int N>
+struct VectorType {
+  enum { SIZE = N };
+
+  // Underlying GCC vector holding all N lanes.
+  typedef T data_type __attribute__((vector_size(sizeof(T) * N)));
+  // Scalar type of one selector lane, and the N-lane selector vector that is
+  // passed to __builtin_shuffle.
+  typedef typename VectorMask<T>::type mask_index;
+  typedef mask_index mask_type
+      __attribute__((vector_size(sizeof(mask_index) * N)));
+  // GCC vector covering half of the lanes.
+  typedef T half_type __attribute__((vector_size(sizeof(T) * (N / 2))));
+  // Every member of this union is a different view of the same lane storage.
+  union {
+    data_type data;
+    struct {
+      T x, y, z, w;
+    };
+    T elements[N];
+    struct {
+      half_type low_half, high_half;
+    };
+  };
+
+  VectorType() : data{0} {}
+
+  constexpr VectorType(const VectorType& rhs) : data(rhs.data) {}
+  // GCC vector extensions only support broadcasting scalars on arithmetic ops,
+  // but not on initializers, hence the following...
+  constexpr VectorType(T n) : data((data_type){0} + n) {}
+  // Lane-by-lane constructors for 4, 8 and 16 lanes.
+  constexpr VectorType(T a, T b, T c, T d) : data{a, b, c, d} {}
+  constexpr VectorType(T a, T b, T c, T d, T e, T f, T g, T h)
+      : data{a, b, c, d, e, f, g, h} {}
+  constexpr VectorType(T a, T b, T c, T d, T e, T f, T g, T h, T i, T j, T k,
+                       T l, T m, T n, T o, T p)
+      : data{a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p} {}
+
+  // Builds a VectorType around an existing raw GCC vector.
+  SI VectorType wrap(const data_type& data) {
+    VectorType v;
+    v.data = data;
+    return v;
+  }
+
+  // Per-lane access; `i` is not range-checked.
+  T& operator[](size_t i) { return elements[i]; }
+  T operator[](size_t i) const { return elements[i]; }
+
+  // Lane-type conversions: each lane is converted with U(lane). These are what
+  // the non-Clang CONVERT macro's C-style cast resolves to.
+  // NOTE(review): the initializers are typed as VectorType<U, N>::data_type,
+  // so each operator appears usable only when N equals the target lane count
+  // -- confirm against callers.
+  template <typename U>
+  operator VectorType<U, 2>() const {
+    return VectorType<U, 2>::wrap(
+        (typename VectorType<U, N>::data_type){U(x), U(y)});
+  }
+  template <typename U>
+  operator VectorType<U, 4>() const {
+    return VectorType<U, 4>::wrap(
+        (typename VectorType<U, N>::data_type){U(x), U(y), U(z), U(w)});
+  }
+  template <typename U>
+  operator VectorType<U, 8>() const {
+    return VectorType<U, 8>::wrap((typename VectorType<U, N>::data_type){
+        U(elements[0]), U(elements[1]), U(elements[2]), U(elements[3]),
+        U(elements[4]), U(elements[5]), U(elements[6]), U(elements[7])});
+  }
+  template <typename U>
+  operator VectorType<U, 16>() const {
+    return VectorType<U, 16>::wrap((typename VectorType<U, N>::data_type){
+        U(elements[0]),
+        U(elements[1]),
+        U(elements[2]),
+        U(elements[3]),
+        U(elements[4]),
+        U(elements[5]),
+        U(elements[6]),
+        U(elements[7]),
+        U(elements[8]),
+        U(elements[9]),
+        U(elements[10]),
+        U(elements[11]),
+        U(elements[12]),
+        U(elements[13]),
+        U(elements[14]),
+        U(elements[15]),
+    });
+  }
+
+  // Lane-wise unary and binary operators, forwarded to the GCC vector. The
+  // scalar overloads rely on GCC broadcasting the scalar operand.
+  VectorType operator-() const { return wrap(-data); }
+  VectorType operator~() const { return wrap(~data); }
+
+  VectorType operator&(VectorType x) const { return wrap(data & x.data); }
+  VectorType operator&(T x) const { return wrap(data & x); }
+  VectorType operator|(VectorType x) const { return wrap(data | x.data); }
+  VectorType operator|(T x) const { return wrap(data | x); }
+  VectorType operator^(VectorType x) const { return wrap(data ^ x.data); }
+  VectorType operator^(T x) const { return wrap(data ^ x); }
+  VectorType operator<<(int x) const { return wrap(data << x); }
+  VectorType operator>>(int x) const { return wrap(data >> x); }
+  VectorType operator+(VectorType x) const { return wrap(data + x.data); }
+  VectorType operator+(T x) const { return wrap(data + x); }
+  friend VectorType operator+(T x, VectorType y) { return wrap(x + y.data); }
+  VectorType operator-(VectorType x) const { return wrap(data - x.data); }
+  VectorType operator-(T x) const { return wrap(data - x); }
+  friend VectorType operator-(T x, VectorType y) { return wrap(x - y.data); }
+  VectorType operator*(VectorType x) const { return wrap(data * x.data); }
+  VectorType operator*(T x) const { return wrap(data * x); }
+  friend VectorType operator*(T x, VectorType y) { return wrap(x * y.data); }
+  VectorType operator/(VectorType x) const { return wrap(data / x.data); }
+  VectorType operator/(T x) const { return wrap(data / x); }
+  friend VectorType operator/(T x, VectorType y) { return wrap(x / y.data); }
+  VectorType operator%(int x) const { return wrap(data % x); }
+
+  // Compound assignment: update the lanes in place and return *this.
+  VectorType& operator&=(VectorType x) {
+    data &= x.data;
+    return *this;
+  }
+  VectorType& operator|=(VectorType x) {
+    data |= x.data;
+    return *this;
+  }
+  VectorType& operator^=(VectorType x) {
+    data ^= x.data;
+    return *this;
+  }
+  VectorType& operator<<=(int x) {
+    data <<= x;
+    return *this;
+  }
+  VectorType& operator>>=(int x) {
+    data >>= x;
+    return *this;
+  }
+  VectorType& operator+=(VectorType x) {
+    data += x.data;
+    return *this;
+  }
+  VectorType& operator-=(VectorType x) {
+    data -= x.data;
+    return *this;
+  }
+  VectorType& operator*=(VectorType x) {
+    data *= x.data;
+    return *this;
+  }
+  VectorType& operator/=(VectorType x) {
+    data /= x.data;
+    return *this;
+  }
+  VectorType& operator%=(int x) {
+    data %= x;
+    return *this;
+  }
+
+  // Lane-wise comparisons. The result is a vector whose lanes have the scalar
+  // mask_index type, so the wrapper is VectorType<mask_index, N>. (mask_type
+  // is already an N-lane vector and cannot itself be used as a lane type.)
+  VectorType<mask_index, N> operator==(VectorType x) const {
+    return VectorType<mask_index, N>::wrap(data == x.data);
+  }
+  VectorType<mask_index, N> operator!=(VectorType x) const {
+    return VectorType<mask_index, N>::wrap(data != x.data);
+  }
+  VectorType<mask_index, N> operator<(VectorType x) const {
+    return VectorType<mask_index, N>::wrap(data < x.data);
+  }
+  VectorType<mask_index, N> operator>(VectorType x) const {
+    return VectorType<mask_index, N>::wrap(data > x.data);
+  }
+  VectorType<mask_index, N> operator<=(VectorType x) const {
+    return VectorType<mask_index, N>::wrap(data <= x.data);
+  }
+  VectorType<mask_index, N> operator>=(VectorType x) const {
+    return VectorType<mask_index, N>::wrap(data >= x.data);
+  }
+
+  // Logical operators; && and || are implemented as bitwise & and | on the
+  // lanes.
+  VectorType operator!() const { return wrap(!data); }
+  VectorType operator&&(VectorType x) const { return wrap(data & x.data); }
+  VectorType operator||(VectorType x) const { return wrap(data | x.data); }
+
+  VectorType& operator=(VectorType x) {
+    data = x.data;
+    return *this;
+  }
+
+  // Two-input shuffles: pick lanes from the concatenation of this vector and
+  // `b` by index, producing 4, 8 or 16 lanes.
+  VectorType<T, 4> shuffle(VectorType b, mask_index x, mask_index y,
+                           mask_index z, mask_index w) const {
+    return VectorType<T, 4>::wrap(__builtin_shuffle(
+        data, b.data, (typename VectorType<T, 4>::mask_type){x, y, z, w}));
+  }
+  VectorType<T, 8> shuffle(VectorType b, mask_index x, mask_index y,
+                           mask_index z, mask_index w, mask_index s,
+                           mask_index t, mask_index u, mask_index v) const {
+    return VectorType<T, 8>::wrap(__builtin_shuffle(
+        data, b.data,
+        (typename VectorType<T, 8>::mask_type){x, y, z, w, s, t, u, v}));
+  }
+  VectorType<T, 16> shuffle(VectorType b, mask_index x, mask_index y,
+                            mask_index z, mask_index w, mask_index s,
+                            mask_index t, mask_index u, mask_index v,
+                            mask_index i, mask_index j, mask_index k,
+                            mask_index l, mask_index m, mask_index n,
+                            mask_index o, mask_index p) const {
+    return VectorType<T, 16>::wrap(
+        __builtin_shuffle(data, b.data,
+                          (typename VectorType<T, 16>::mask_type){
+                              x, y, z, w, s, t, u, v, i, j, k, l, m, n, o, p}));
+  }
+
+  // Single-input shuffles: reorder or duplicate this vector's own lanes.
+  VectorType<T, 4> swizzle(mask_index x, mask_index y, mask_index z,
+                           mask_index w) const {
+    return VectorType<T, 4>::wrap(__builtin_shuffle(
+        data, (typename VectorType<T, 4>::mask_type){x, y, z, w}));
+  }
+  VectorType<T, 8> swizzle(mask_index x, mask_index y, mask_index z,
+                           mask_index w, mask_index s, mask_index t,
+                           mask_index u, mask_index v) const {
+    return VectorType<T, 8>::wrap(__builtin_shuffle(
+        data, (typename VectorType<T, 8>::mask_type){x, y, z, w, s, t, u, v}));
+  }
+
+  // Builds a VectorType from separately supplied low and high halves.
+  SI VectorType wrap(half_type low, half_type high) {
+    VectorType v;
+    v.low_half = low;
+    v.high_half = high;
+    return v;
+  }
+
+  // Returns a vector of twice the width with this vector as the low half and
+  // `high` as the high half.
+  VectorType<T, N * 2> combine(VectorType high) const {
+    return VectorType<T, N * 2>::wrap(data, high.data);
+  }
+
+  // Swizzle names are macros, so that `v.xyxy` expands to the member call
+  // `v.swizzle(0, 1, 0, 1)`. The 8-lane names expand to the upper-case helper
+  // methods defined next to them.
+#  define xyxy swizzle(0, 1, 0, 1)
+#  define zwzw swizzle(2, 3, 2, 3)
+#  define zwxy swizzle(2, 3, 0, 1)
+#  define zyxw swizzle(2, 1, 0, 3)
+#  define xyzz swizzle(0, 1, 2, 2)
+#  define yzwx swizzle(1, 2, 3, 0)
+#  define wxyz swizzle(3, 0, 1, 2)
+#  define xxxxyyyy XXXXYYYY()
+  VectorType<T, 8> XXXXYYYY() const {
+    return swizzle(0, 0, 0, 0).combine(swizzle(1, 1, 1, 1));
+  }
+#  define zzzzwwww ZZZZWWWW()
+  VectorType<T, 8> ZZZZWWWW() const {
+    return swizzle(2, 2, 2, 2).combine(swizzle(3, 3, 3, 3));
+  }
+#  define xyzwxyzw XYZWXYZW()
+  VectorType<T, 8> XYZWXYZW() const { return combine(*this); }
+#  define xyxyxyxy XYXYXYXY()
+  VectorType<T, 8> XYXYXYXY() const {
+    return swizzle(0, 1, 0, 1).combine(swizzle(0, 1, 0, 1));
+  }
+#  define zwzwzwzw ZWZWZWZW()
+  VectorType<T, 8> ZWZWZWZW() const {
+    return swizzle(2, 3, 2, 3).combine(swizzle(2, 3, 2, 3));
+  }
+#  define xxyyzzww XXYYZZWW()
+  VectorType<T, 8> XXYYZZWW() const {
+    return swizzle(0, 0, 1, 1).combine(swizzle(2, 2, 3, 3));
+  }
+};
+
+// Two-lane specialization for the non-Clang path. It exposes only storage
+// (raw vector, named x/y lanes, element array) and wrap(); none of the
+// operators, constructors or shuffles of the general template are provided.
+template <typename T>
+struct VectorType<T, 2> {
+ typedef T data_type __attribute__((vector_size(sizeof(T) * 2)));
+ // Every member of this union is a different view of the same two lanes.
+ union {
+ data_type data;
+ struct {
+ T x, y;
+ };
+ T elements[2];
+ };
+
+ // Builds a VectorType around an existing raw GCC vector.
+ SI VectorType wrap(const data_type& data) {
+ VectorType v;
+ v.data = data;
+ return v;
+ }
+};
+
+// Non-Clang equivalents of the builtin-based macros: CONVERT is a C-style cast
+// to `type` (resolved through VectorType's templated conversion operators) and
+// SHUFFLE calls the two-input VectorType::shuffle member on `a`.
+# define CONVERT(vector, type) ((type)(vector))
+# define SHUFFLE(a, b, ...) a.shuffle(b, __VA_ARGS__)
+
+// Concatenates two N-lane vectors into a 2N-lane vector, with `a` as the low
+// half and `b` as the high half.
+template <typename T, int N>
+SI VectorType<T, N * 2> combine(VectorType<T, N> a, VectorType<T, N> b) {
+ return VectorType<T, N * 2>::wrap(a.data, b.data);
+}
+
+// Returns the low half (first N / 2 lanes) of `a`.
+template <typename T, int N>
+SI VectorType<T, N / 2> lowHalf(VectorType<T, N> a) {
+ return VectorType<T, N / 2>::wrap(a.low_half);
+}
+
+// Returns the high half (last N / 2 lanes) of `a`.
+template <typename T, int N>
+SI VectorType<T, N / 2> highHalf(VectorType<T, N> a) {
+ return VectorType<T, N / 2>::wrap(a.high_half);
+}
+
+// Widens `a` to twice as many lanes by placing a copy of `a` in both halves.
+// (The Clang expand() leaves the upper lanes unspecified instead, so portable
+// callers should only rely on the low half.)
+template <typename T, int N>
+SI VectorType<T, N * 2> expand(VectorType<T, N> a) {
+ return combine(a, a);
+}
+#endif
+
+// Interleaves the low two lanes of `a` and `b`: a0, b0, a1, b1.
+// Shuffle indices 0-3 select from `a` and 4-7 from `b`.
+template <typename T>
+SI VectorType<T, 4> zipLow(VectorType<T, 4> a, VectorType<T, 4> b) {
+ return SHUFFLE(a, b, 0, 4, 1, 5);
+}
+
+// Interleaves the high two lanes of `a` and `b`: a2, b2, a3, b3.
+// Shuffle indices 0-3 select from `a` and 4-7 from `b`.
+template <typename T>
+SI VectorType<T, 4> zipHigh(VectorType<T, 4> a, VectorType<T, 4> b) {
+ return SHUFFLE(a, b, 2, 6, 3, 7);
+}
+
+// Interleaves the low four lanes of `a` and `b`: a0, b0, a1, b1, a2, b2, a3,
+// b3. Shuffle indices 0-7 select from `a` and 8-15 from `b`.
+template <typename T>
+SI VectorType<T, 8> zipLow(VectorType<T, 8> a, VectorType<T, 8> b) {
+ return SHUFFLE(a, b, 0, 8, 1, 9, 2, 10, 3, 11);
+}
+
+// Interleaves the high four lanes of `a` and `b`: a4, b4, a5, b5, a6, b6, a7,
+// b7. Shuffle indices 0-7 select from `a` and 8-15 from `b`.
+template <typename T>
+SI VectorType<T, 8> zipHigh(VectorType<T, 8> a, VectorType<T, 8> b) {
+ return SHUFFLE(a, b, 4, 12, 5, 13, 6, 14, 7, 15);
+}
+
+// Interleaves the low eight lanes of `a` and `b`: a0, b0, a1, b1, ..., a7, b7.
+// Shuffle indices 0-15 select from `a` and 16-31 from `b`, so the index list
+// alternates i, i + 16 in the same way the 4- and 8-lane zipLow overloads
+// alternate i, i + N. (Merely concatenating the two low halves is what
+// combine(lowHalf(a), lowHalf(b)) is for.)
+template <typename T>
+SI VectorType<T, 16> zipLow(VectorType<T, 16> a, VectorType<T, 16> b) {
+  return SHUFFLE(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+}
+
+// Interleaves the high eight lanes of `a` and `b`: a8, b8, a9, b9, ..., a15,
+// b15. Shuffle indices 0-15 select from `a` and 16-31 from `b`, so the index
+// list alternates i, i + 16 in the same way the 4- and 8-lane zipHigh
+// overloads alternate i, i + N.
+template <typename T>
+SI VectorType<T, 16> zipHigh(VectorType<T, 16> a, VectorType<T, 16> b) {
+  return SHUFFLE(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15,
+                 31);
+}
+
+// Interleaves the low halves of `a` and `b` in units of two lanes:
+// a0, a1, b0, b1, a2, a3, b2, b3.
+template <typename T>
+SI VectorType<T, 8> zip2Low(VectorType<T, 8> a, VectorType<T, 8> b) {
+ return SHUFFLE(a, b, 0, 1, 8, 9, 2, 3, 10, 11);
+}
+
+// Interleaves the high halves of `a` and `b` in units of two lanes:
+// a4, a5, b4, b5, a6, a7, b6, b7.
+template <typename T>
+SI VectorType<T, 8> zip2High(VectorType<T, 8> a, VectorType<T, 8> b) {
+ return SHUFFLE(a, b, 4, 5, 12, 13, 6, 7, 14, 15);
+}
+
+#ifdef __clang__
+// Fully interleaves two 4-lane vectors into one 8-lane vector:
+// a0, b0, a1, b1, a2, b2, a3, b3.
+template <typename T>
+SI VectorType<T, 8> zip(VectorType<T, 4> a, VectorType<T, 4> b) {
+ return SHUFFLE(a, b, 0, 4, 1, 5, 2, 6, 3, 7);
+}
+
+// Fully interleaves two 8-lane vectors into one 16-lane vector:
+// a0, b0, a1, b1, ..., a7, b7.
+template <typename T>
+SI VectorType<T, 16> zip(VectorType<T, 8> a, VectorType<T, 8> b) {
+ return SHUFFLE(a, b, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+}
+#else
+// Non-Clang full interleave of `a` and `b` into a vector of twice the width.
+// The member shuffle used by SHUFFLE here is not relied on to widen its
+// inputs, so the result is assembled from the two same-width half-interleaves.
+template <typename T, int N>
+SI VectorType<T, N * 2> zip(VectorType<T, N> a, VectorType<T, N> b) {
+  auto lowPairs = zipLow(a, b);
+  auto highPairs = zipHigh(a, b);
+  return combine(lowPairs, highPairs);
+}
+#endif
+
+// Load/store helper for addresses that may not satisfy T's alignment: the
+// bytes are moved with memcpy instead of dereferencing a T pointer.
+template <typename T>
+struct Unaligned {
+  // Copies sizeof(T) bytes starting at `p` into a fresh T and returns it.
+  template <typename P>
+  SI T load(const P* p) {
+    T result;
+    memcpy(&result, p, sizeof(T));
+    return result;
+  }
+
+  // Copies the sizeof(T) bytes of `v` to the memory starting at `p`.
+  template <typename P>
+  SI void store(P* p, T v) {
+    memcpy(p, &v, sizeof(T));
+  }
+};
+
+#ifndef __clang__
+// Non-Clang specialization for the VectorType wrapper struct: the copy goes
+// through the `elements` array view of the lanes rather than the address of
+// the struct itself.
+template <typename T, int N>
+struct Unaligned<VectorType<T, N>> {
+  // Copies sizeof(VectorType<T, N>) bytes starting at `p` into a new vector.
+  template <typename P>
+  SI VectorType<T, N> load(const P* p) {
+    VectorType<T, N> result;
+    memcpy(result.elements, p, sizeof(VectorType<T, N>));
+    return result;
+  }
+
+  // Copies the lanes of `v` to the memory starting at `p`.
+  template <typename P>
+  SI void store(P* p, VectorType<T, N> v) {
+    memcpy(p, v.elements, sizeof(VectorType<T, N>));
+  }
+};
+#endif
+
+// Reads a T from `p` without requiring `p` to be aligned for T. T must be
+// given explicitly; P is deduced. Forwards to Unaligned<T>::load.
+template <typename T, typename P>
+SI T unaligned_load(const P* p) {
+ return Unaligned<T>::load(p);
+}
+
+// Writes `v` to `p` without requiring `p` to be aligned for T. Forwards to
+// Unaligned<T>::store.
+template <typename T, typename P>
+SI void unaligned_store(P* p, T v) {
+ Unaligned<T>::store(p, v);
+}
+
+// Reinterprets the bytes of `src` as a value of type D. The two types must
+// have the same size, which is enforced at compile time; the copy itself is
+// done by unaligned_load.
+template <typename D, typename S>
+SI D bit_cast(const S& src) {
+  static_assert(sizeof(D) == sizeof(S), "");
+  const S* bytes = &src;
+  return unaligned_load<D>(bytes);
+}
+
+// Shorthand aliases: VN<T> is an N-lane vector of T, and the named aliases
+// below are all 4-lane vectors of the indicated scalar type.
+template <typename T>
+using V2 = VectorType<T, 2>;
+template <typename T>
+using V4 = VectorType<T, 4>;
+using Float = V4<float>;
+using I32 = V4<int32_t>;
+using I16 = V4<int16_t>;
+using U64 = V4<uint64_t>;
+using U32 = V4<uint32_t>;
+using U16 = V4<uint16_t>;
+using U8 = V4<uint8_t>;
+// Four int lanes; presumably used for per-lane boolean masks -- confirm
+// against users of Bool.
+using Bool = V4<int>;
+template <typename T>
+using V8 = VectorType<T, 8>;
+template <typename T>
+using V16 = VectorType<T, 16>;
+
+} // namespace glsl