path: root/gfx/skia/skia/modules/skcms
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit     36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree       105e8c98ddea1c1e4784a60a5a6410fa416be2de /gfx/skia/skia/modules/skcms
parent     Initial commit. (diff)
download   firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
           firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip

Adding upstream version 115.7.0esr. (upstream/115.7.0esr)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/skia/skia/modules/skcms')
-rw-r--r--  gfx/skia/skia/modules/skcms/README.chromium        5
-rw-r--r--  gfx/skia/skia/modules/skcms/skcms.cc             3064
-rw-r--r--  gfx/skia/skia/modules/skcms/skcms.gni              20
-rw-r--r--  gfx/skia/skia/modules/skcms/skcms.h               418
-rw-r--r--  gfx/skia/skia/modules/skcms/skcms_internal.h       56
-rw-r--r--  gfx/skia/skia/modules/skcms/src/Transform_inl.h  1628
-rwxr-xr-x  gfx/skia/skia/modules/skcms/version.sha1            1
7 files changed, 5192 insertions, 0 deletions
diff --git a/gfx/skia/skia/modules/skcms/README.chromium b/gfx/skia/skia/modules/skcms/README.chromium
new file mode 100644
index 0000000000..046f6b1d19
--- /dev/null
+++ b/gfx/skia/skia/modules/skcms/README.chromium
@@ -0,0 +1,5 @@
+Name: skcms
+URL: https://skia.org/
+Version: unknown
+Security Critical: yes
+License: BSD
diff --git a/gfx/skia/skia/modules/skcms/skcms.cc b/gfx/skia/skia/modules/skcms/skcms.cc
new file mode 100644
index 0000000000..246c08af94
--- /dev/null
+++ b/gfx/skia/skia/modules/skcms/skcms.cc
@@ -0,0 +1,3064 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "skcms.h"
+#include "skcms_internal.h"
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(__ARM_NEON)
+ #include <arm_neon.h>
+#elif defined(__SSE__)
+ #include <immintrin.h>
+
+ #if defined(__clang__)
+ // That #include <immintrin.h> is usually enough, but Clang's headers
+ // "helpfully" skip including the whole kitchen sink when _MSC_VER is
+ // defined, because lots of programs on Windows would include that and
+ // it'd be a lot slower. But we want all those headers included so we
+ // can use their features after runtime checks later.
+ #include <smmintrin.h>
+ #include <avxintrin.h>
+ #include <avx2intrin.h>
+ #include <avx512fintrin.h>
+ #include <avx512dqintrin.h>
+ #endif
+#endif
+
+static bool runtime_cpu_detection = true;
+void skcms_DisableRuntimeCPUDetection() {
+ runtime_cpu_detection = false;
+}
+
+// sizeof(x) will return size_t, which is 32-bit on some machines and 64-bit on others.
+// We have better testing on 64-bit machines, so force 32-bit machines to behave like 64-bit.
+//
+// Please do not use sizeof() directly, and use size_t only when required.
+// (We have no way of enforcing these requests...)
+#define SAFE_SIZEOF(x) ((uint64_t)sizeof(x))
+
+// Same sort of thing for _Layout structs with a variable-sized array at the end (named "variable").
+#define SAFE_FIXED_SIZE(type) ((uint64_t)offsetof(type, variable))
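+// (For example, SAFE_FIXED_SIZE(para_Layout) below evaluates to 12: the four fixed-size
+// fields that precede its `variable` array.)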
+
+static const union {
+ uint32_t bits;
+ float f;
+} inf_ = { 0x7f800000 };
+#define INFINITY_ inf_.f
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define small_memcpy __builtin_memcpy
+#else
+ #define small_memcpy memcpy
+#endif
+
+static float log2f_(float x) {
+ // The first approximation of log2(x) is its exponent 'e', minus 127.
+ int32_t bits;
+ small_memcpy(&bits, &x, sizeof(bits));
+
+ float e = (float)bits * (1.0f / (1<<23));
+
+    // If we use the mantissa too, we can refine the error significantly.
+ int32_t m_bits = (bits & 0x007fffff) | 0x3f000000;
+ float m;
+ small_memcpy(&m, &m_bits, sizeof(m));
+
+ return (e - 124.225514990f
+ - 1.498030302f*m
+ - 1.725879990f/(0.3520887068f + m));
+}
+static float logf_(float x) {
+ const float ln2 = 0.69314718f;
+ return ln2*log2f_(x);
+}
+
+static float exp2f_(float x) {
+ float fract = x - floorf_(x);
+
+ float fbits = (1.0f * (1<<23)) * (x + 121.274057500f
+ - 1.490129070f*fract
+ + 27.728023300f/(4.84252568f - fract));
+
+ // Before we cast fbits to int32_t, check for out of range values to pacify UBSAN.
+ // INT_MAX is not exactly representable as a float, so exclude it as effectively infinite.
+ // Negative values are effectively underflow - we'll end up returning a (different) negative
+ // value, which makes no sense. So clamp to zero.
+ if (fbits >= (float)INT_MAX) {
+ return INFINITY_;
+ } else if (fbits < 0) {
+ return 0;
+ }
+
+ int32_t bits = (int32_t)fbits;
+ small_memcpy(&x, &bits, sizeof(x));
+ return x;
+}
+
+// Not static, as it's used by some test tools.
+float powf_(float x, float y) {
+ assert (x >= 0);
+ return (x == 0) || (x == 1) ? x
+ : exp2f_(log2f_(x) * y);
+}
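+
+// As a rough worked example of the composition above: powf_(0.5f, 2.2f) evaluates
+// exp2f_(2.2f * log2f_(0.5f)), i.e. 2^-2.2, about 0.2176; the remaining error comes
+// from the log2f_/exp2f_ approximations rather than from the multiply itself.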
+
+static float expf_(float x) {
+ const float log2_e = 1.4426950408889634074f;
+ return exp2f_(log2_e * x);
+}
+
+static float fmaxf_(float x, float y) { return x > y ? x : y; }
+static float fminf_(float x, float y) { return x < y ? x : y; }
+
+static bool isfinitef_(float x) { return 0 == x*0; }
+
+static float minus_1_ulp(float x) {
+ int32_t bits;
+ memcpy(&bits, &x, sizeof(bits));
+ bits = bits - 1;
+ memcpy(&x, &bits, sizeof(bits));
+ return x;
+}
+
+// Most transfer functions we work with are sRGBish.
+// For exotic HDR transfer functions, we encode them using a tf.g that makes no sense,
+// and repurpose the other fields to hold the parameters of the HDR functions.
+struct TF_PQish { float A,B,C,D,E,F; };
+struct TF_HLGish { float R,G,a,b,c,K_minus_1; };
+// We didn't originally support a scale factor K for HLG, and instead just stored 0 in
+// the unused `f` field of skcms_TransferFunction for HLGish and HLGInvish transfer functions.
+// By storing f=K-1, those old unused f=0 values now mean K=1, a no-op scale factor.
+
+static float TFKind_marker(skcms_TFType kind) {
+ // We'd use different NaNs, but those aren't guaranteed to be preserved by WASM.
+ return -(float)kind;
+}
+
+static skcms_TFType classify(const skcms_TransferFunction& tf, TF_PQish* pq = nullptr
+ , TF_HLGish* hlg = nullptr) {
+ if (tf.g < 0 && static_cast<float>(static_cast<int>(tf.g)) == tf.g) {
+ // TODO: soundness checks for PQ/HLG like we do for sRGBish?
+ switch ((int)tf.g) {
+ case -skcms_TFType_PQish:
+ if (pq) {
+ memcpy(pq , &tf.a, sizeof(*pq ));
+ }
+ return skcms_TFType_PQish;
+ case -skcms_TFType_HLGish:
+ if (hlg) {
+ memcpy(hlg, &tf.a, sizeof(*hlg));
+ }
+ return skcms_TFType_HLGish;
+ case -skcms_TFType_HLGinvish:
+ if (hlg) {
+ memcpy(hlg, &tf.a, sizeof(*hlg));
+ }
+ return skcms_TFType_HLGinvish;
+ }
+ return skcms_TFType_Invalid;
+ }
+
+ // Basic soundness checks for sRGBish transfer functions.
+ if (isfinitef_(tf.a + tf.b + tf.c + tf.d + tf.e + tf.f + tf.g)
+ // a,c,d,g should be non-negative to make any sense.
+ && tf.a >= 0
+ && tf.c >= 0
+ && tf.d >= 0
+ && tf.g >= 0
+ // Raising a negative value to a fractional tf->g produces complex numbers.
+ && tf.a * tf.d + tf.b >= 0) {
+ return skcms_TFType_sRGBish;
+ }
+
+ return skcms_TFType_Invalid;
+}
+
+skcms_TFType skcms_TransferFunction_getType(const skcms_TransferFunction* tf) {
+ return classify(*tf);
+}
+bool skcms_TransferFunction_isSRGBish(const skcms_TransferFunction* tf) {
+ return classify(*tf) == skcms_TFType_sRGBish;
+}
+bool skcms_TransferFunction_isPQish(const skcms_TransferFunction* tf) {
+ return classify(*tf) == skcms_TFType_PQish;
+}
+bool skcms_TransferFunction_isHLGish(const skcms_TransferFunction* tf) {
+ return classify(*tf) == skcms_TFType_HLGish;
+}
+
+bool skcms_TransferFunction_makePQish(skcms_TransferFunction* tf,
+ float A, float B, float C,
+ float D, float E, float F) {
+ *tf = { TFKind_marker(skcms_TFType_PQish), A,B,C,D,E,F };
+ assert(skcms_TransferFunction_isPQish(tf));
+ return true;
+}
+
+bool skcms_TransferFunction_makeScaledHLGish(skcms_TransferFunction* tf,
+ float K, float R, float G,
+ float a, float b, float c) {
+ *tf = { TFKind_marker(skcms_TFType_HLGish), R,G, a,b,c, K-1.0f };
+ assert(skcms_TransferFunction_isHLGish(tf));
+ return true;
+}
+
+float skcms_TransferFunction_eval(const skcms_TransferFunction* tf, float x) {
+ float sign = x < 0 ? -1.0f : 1.0f;
+ x *= sign;
+
+ TF_PQish pq;
+ TF_HLGish hlg;
+ switch (classify(*tf, &pq, &hlg)) {
+ case skcms_TFType_Invalid: break;
+
+ case skcms_TFType_HLGish: {
+ const float K = hlg.K_minus_1 + 1.0f;
+ return K * sign * (x*hlg.R <= 1 ? powf_(x*hlg.R, hlg.G)
+ : expf_((x-hlg.c)*hlg.a) + hlg.b);
+ }
+
+ // skcms_TransferFunction_invert() inverts R, G, and a for HLGinvish so this math is fast.
+ case skcms_TFType_HLGinvish: {
+ const float K = hlg.K_minus_1 + 1.0f;
+ x /= K;
+ return sign * (x <= 1 ? hlg.R * powf_(x, hlg.G)
+ : hlg.a * logf_(x - hlg.b) + hlg.c);
+ }
+
+ case skcms_TFType_sRGBish:
+ return sign * (x < tf->d ? tf->c * x + tf->f
+ : powf_(tf->a * x + tf->b, tf->g) + tf->e);
+
+ case skcms_TFType_PQish: return sign * powf_(fmaxf_(pq.A + pq.B * powf_(x, pq.C), 0)
+ / (pq.D + pq.E * powf_(x, pq.C)),
+ pq.F);
+ }
+ return 0;
+}
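+
+// A minimal usage sketch (not exercised here): evaluate the canonical sRGB curve from
+// skcms_sRGB_profile() below and round-trip through its inverse.
+//
+//     const skcms_TransferFunction* srgb = skcms_sRGB_TransferFunction();
+//     skcms_TransferFunction inv;
+//     if (skcms_TransferFunction_invert(srgb, &inv)) {
+//         float linear  = skcms_TransferFunction_eval(srgb, 0.5f);    // ~0.214, linear light
+//         float encoded = skcms_TransferFunction_eval(&inv, linear);  // ~0.5 again
+//     }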
+
+
+static float eval_curve(const skcms_Curve* curve, float x) {
+ if (curve->table_entries == 0) {
+ return skcms_TransferFunction_eval(&curve->parametric, x);
+ }
+
+ float ix = fmaxf_(0, fminf_(x, 1)) * static_cast<float>(curve->table_entries - 1);
+ int lo = (int) ix ,
+ hi = (int)(float)minus_1_ulp(ix + 1.0f);
+ float t = ix - (float)lo;
+
+ float l, h;
+ if (curve->table_8) {
+ l = curve->table_8[lo] * (1/255.0f);
+ h = curve->table_8[hi] * (1/255.0f);
+ } else {
+ uint16_t be_l, be_h;
+ memcpy(&be_l, curve->table_16 + 2*lo, 2);
+ memcpy(&be_h, curve->table_16 + 2*hi, 2);
+ uint16_t le_l = ((be_l << 8) | (be_l >> 8)) & 0xffff;
+ uint16_t le_h = ((be_h << 8) | (be_h >> 8)) & 0xffff;
+ l = le_l * (1/65535.0f);
+ h = le_h * (1/65535.0f);
+ }
+ return l + (h-l)*t;
+}
+
+float skcms_MaxRoundtripError(const skcms_Curve* curve, const skcms_TransferFunction* inv_tf) {
+ uint32_t N = curve->table_entries > 256 ? curve->table_entries : 256;
+ const float dx = 1.0f / static_cast<float>(N - 1);
+ float err = 0;
+ for (uint32_t i = 0; i < N; i++) {
+ float x = static_cast<float>(i) * dx,
+ y = eval_curve(curve, x);
+ err = fmaxf_(err, fabsf_(x - skcms_TransferFunction_eval(inv_tf, y)));
+ }
+ return err;
+}
+
+bool skcms_AreApproximateInverses(const skcms_Curve* curve, const skcms_TransferFunction* inv_tf) {
+ return skcms_MaxRoundtripError(curve, inv_tf) < (1/512.0f);
+}
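+
+// The 1/512 tolerance above is roughly half an 8-bit quantization step, i.e. the curve
+// and its claimed inverse must agree to better than half a bit of 8-bit precision.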
+
+// Additional ICC signature values that are only used internally
+enum {
+ // File signature
+ skcms_Signature_acsp = 0x61637370,
+
+ // Tag signatures
+ skcms_Signature_rTRC = 0x72545243,
+ skcms_Signature_gTRC = 0x67545243,
+ skcms_Signature_bTRC = 0x62545243,
+ skcms_Signature_kTRC = 0x6B545243,
+
+ skcms_Signature_rXYZ = 0x7258595A,
+ skcms_Signature_gXYZ = 0x6758595A,
+ skcms_Signature_bXYZ = 0x6258595A,
+
+ skcms_Signature_A2B0 = 0x41324230,
+ skcms_Signature_B2A0 = 0x42324130,
+
+ skcms_Signature_CHAD = 0x63686164,
+ skcms_Signature_WTPT = 0x77747074,
+
+ skcms_Signature_CICP = 0x63696370,
+
+ // Type signatures
+ skcms_Signature_curv = 0x63757276,
+ skcms_Signature_mft1 = 0x6D667431,
+ skcms_Signature_mft2 = 0x6D667432,
+ skcms_Signature_mAB = 0x6D414220,
+ skcms_Signature_mBA = 0x6D424120,
+ skcms_Signature_para = 0x70617261,
+ skcms_Signature_sf32 = 0x73663332,
+ // XYZ is also a PCS signature, so it's defined in skcms.h
+ // skcms_Signature_XYZ = 0x58595A20,
+};
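+
+// Each value above is the big-endian ASCII spelling of the four-character ICC name,
+// e.g. 0x61637370 is 'a','c','s','p' and 0x6D414220 is 'm','A','B',' ' (space-padded).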
+
+static uint16_t read_big_u16(const uint8_t* ptr) {
+ uint16_t be;
+ memcpy(&be, ptr, sizeof(be));
+#if defined(_MSC_VER)
+ return _byteswap_ushort(be);
+#else
+ return __builtin_bswap16(be);
+#endif
+}
+
+static uint32_t read_big_u32(const uint8_t* ptr) {
+ uint32_t be;
+ memcpy(&be, ptr, sizeof(be));
+#if defined(_MSC_VER)
+ return _byteswap_ulong(be);
+#else
+ return __builtin_bswap32(be);
+#endif
+}
+
+static int32_t read_big_i32(const uint8_t* ptr) {
+ return (int32_t)read_big_u32(ptr);
+}
+
+static float read_big_fixed(const uint8_t* ptr) {
+ return static_cast<float>(read_big_i32(ptr)) * (1.0f / 65536.0f);
+}
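+
+// For reference, the s15.16 encoding read above stores 1.0 as 0x00010000 and 0.5 as
+// 0x00008000, so read_big_fixed() on those big-endian bytes returns 1.0f and 0.5f.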
+
+// Maps to an in-memory profile so that fields line up to the locations specified
+// in ICC.1:2010, section 7.2
+typedef struct {
+ uint8_t size [ 4];
+ uint8_t cmm_type [ 4];
+ uint8_t version [ 4];
+ uint8_t profile_class [ 4];
+ uint8_t data_color_space [ 4];
+ uint8_t pcs [ 4];
+ uint8_t creation_date_time [12];
+ uint8_t signature [ 4];
+ uint8_t platform [ 4];
+ uint8_t flags [ 4];
+ uint8_t device_manufacturer [ 4];
+ uint8_t device_model [ 4];
+ uint8_t device_attributes [ 8];
+ uint8_t rendering_intent [ 4];
+ uint8_t illuminant_X [ 4];
+ uint8_t illuminant_Y [ 4];
+ uint8_t illuminant_Z [ 4];
+ uint8_t creator [ 4];
+ uint8_t profile_id [16];
+ uint8_t reserved [28];
+ uint8_t tag_count [ 4]; // Technically not part of header, but required
+} header_Layout;
+
+typedef struct {
+ uint8_t signature [4];
+ uint8_t offset [4];
+ uint8_t size [4];
+} tag_Layout;
+
+static const tag_Layout* get_tag_table(const skcms_ICCProfile* profile) {
+ return (const tag_Layout*)(profile->buffer + SAFE_SIZEOF(header_Layout));
+}
+
+// s15Fixed16ArrayType is technically variable sized, holding N values. However, the only valid
+// use of the type is for the CHAD tag that stores exactly nine values.
+typedef struct {
+ uint8_t type [ 4];
+ uint8_t reserved [ 4];
+ uint8_t values [36];
+} sf32_Layout;
+
+bool skcms_GetCHAD(const skcms_ICCProfile* profile, skcms_Matrix3x3* m) {
+ skcms_ICCTag tag;
+ if (!skcms_GetTagBySignature(profile, skcms_Signature_CHAD, &tag)) {
+ return false;
+ }
+
+ if (tag.type != skcms_Signature_sf32 || tag.size < SAFE_SIZEOF(sf32_Layout)) {
+ return false;
+ }
+
+ const sf32_Layout* sf32Tag = (const sf32_Layout*)tag.buf;
+ const uint8_t* values = sf32Tag->values;
+ for (int r = 0; r < 3; ++r)
+ for (int c = 0; c < 3; ++c, values += 4) {
+ m->vals[r][c] = read_big_fixed(values);
+ }
+ return true;
+}
+
+// XYZType is technically variable sized, holding N XYZ triples. However, the only valid uses of
+// the type are for tags/data that store exactly one triple.
+typedef struct {
+ uint8_t type [4];
+ uint8_t reserved [4];
+ uint8_t X [4];
+ uint8_t Y [4];
+ uint8_t Z [4];
+} XYZ_Layout;
+
+static bool read_tag_xyz(const skcms_ICCTag* tag, float* x, float* y, float* z) {
+ if (tag->type != skcms_Signature_XYZ || tag->size < SAFE_SIZEOF(XYZ_Layout)) {
+ return false;
+ }
+
+ const XYZ_Layout* xyzTag = (const XYZ_Layout*)tag->buf;
+
+ *x = read_big_fixed(xyzTag->X);
+ *y = read_big_fixed(xyzTag->Y);
+ *z = read_big_fixed(xyzTag->Z);
+ return true;
+}
+
+bool skcms_GetWTPT(const skcms_ICCProfile* profile, float xyz[3]) {
+ skcms_ICCTag tag;
+ return skcms_GetTagBySignature(profile, skcms_Signature_WTPT, &tag) &&
+ read_tag_xyz(&tag, &xyz[0], &xyz[1], &xyz[2]);
+}
+
+static bool read_to_XYZD50(const skcms_ICCTag* rXYZ, const skcms_ICCTag* gXYZ,
+ const skcms_ICCTag* bXYZ, skcms_Matrix3x3* toXYZ) {
+ return read_tag_xyz(rXYZ, &toXYZ->vals[0][0], &toXYZ->vals[1][0], &toXYZ->vals[2][0]) &&
+ read_tag_xyz(gXYZ, &toXYZ->vals[0][1], &toXYZ->vals[1][1], &toXYZ->vals[2][1]) &&
+ read_tag_xyz(bXYZ, &toXYZ->vals[0][2], &toXYZ->vals[1][2], &toXYZ->vals[2][2]);
+}
+
+typedef struct {
+ uint8_t type [4];
+ uint8_t reserved_a [4];
+ uint8_t function_type [2];
+ uint8_t reserved_b [2];
+ uint8_t variable [1/*variable*/]; // 1, 3, 4, 5, or 7 s15.16, depending on function_type
+} para_Layout;
+
+static bool read_curve_para(const uint8_t* buf, uint32_t size,
+ skcms_Curve* curve, uint32_t* curve_size) {
+ if (size < SAFE_FIXED_SIZE(para_Layout)) {
+ return false;
+ }
+
+ const para_Layout* paraTag = (const para_Layout*)buf;
+
+ enum { kG = 0, kGAB = 1, kGABC = 2, kGABCD = 3, kGABCDEF = 4 };
+ uint16_t function_type = read_big_u16(paraTag->function_type);
+ if (function_type > kGABCDEF) {
+ return false;
+ }
+
+ static const uint32_t curve_bytes[] = { 4, 12, 16, 20, 28 };
+ if (size < SAFE_FIXED_SIZE(para_Layout) + curve_bytes[function_type]) {
+ return false;
+ }
+
+ if (curve_size) {
+ *curve_size = SAFE_FIXED_SIZE(para_Layout) + curve_bytes[function_type];
+ }
+
+ curve->table_entries = 0;
+ curve->parametric.a = 1.0f;
+ curve->parametric.b = 0.0f;
+ curve->parametric.c = 0.0f;
+ curve->parametric.d = 0.0f;
+ curve->parametric.e = 0.0f;
+ curve->parametric.f = 0.0f;
+ curve->parametric.g = read_big_fixed(paraTag->variable);
+
+ switch (function_type) {
+ case kGAB:
+ curve->parametric.a = read_big_fixed(paraTag->variable + 4);
+ curve->parametric.b = read_big_fixed(paraTag->variable + 8);
+ if (curve->parametric.a == 0) {
+ return false;
+ }
+ curve->parametric.d = -curve->parametric.b / curve->parametric.a;
+ break;
+ case kGABC:
+ curve->parametric.a = read_big_fixed(paraTag->variable + 4);
+ curve->parametric.b = read_big_fixed(paraTag->variable + 8);
+ curve->parametric.e = read_big_fixed(paraTag->variable + 12);
+ if (curve->parametric.a == 0) {
+ return false;
+ }
+ curve->parametric.d = -curve->parametric.b / curve->parametric.a;
+ curve->parametric.f = curve->parametric.e;
+ break;
+ case kGABCD:
+ curve->parametric.a = read_big_fixed(paraTag->variable + 4);
+ curve->parametric.b = read_big_fixed(paraTag->variable + 8);
+ curve->parametric.c = read_big_fixed(paraTag->variable + 12);
+ curve->parametric.d = read_big_fixed(paraTag->variable + 16);
+ break;
+ case kGABCDEF:
+ curve->parametric.a = read_big_fixed(paraTag->variable + 4);
+ curve->parametric.b = read_big_fixed(paraTag->variable + 8);
+ curve->parametric.c = read_big_fixed(paraTag->variable + 12);
+ curve->parametric.d = read_big_fixed(paraTag->variable + 16);
+ curve->parametric.e = read_big_fixed(paraTag->variable + 20);
+ curve->parametric.f = read_big_fixed(paraTag->variable + 24);
+ break;
+ }
+ return skcms_TransferFunction_isSRGBish(&curve->parametric);
+}
+
+typedef struct {
+ uint8_t type [4];
+ uint8_t reserved [4];
+ uint8_t value_count [4];
+ uint8_t variable [1/*variable*/]; // value_count, 8.8 if 1, uint16 (n*65535) if > 1
+} curv_Layout;
+
+static bool read_curve_curv(const uint8_t* buf, uint32_t size,
+ skcms_Curve* curve, uint32_t* curve_size) {
+ if (size < SAFE_FIXED_SIZE(curv_Layout)) {
+ return false;
+ }
+
+ const curv_Layout* curvTag = (const curv_Layout*)buf;
+
+ uint32_t value_count = read_big_u32(curvTag->value_count);
+ if (size < SAFE_FIXED_SIZE(curv_Layout) + value_count * SAFE_SIZEOF(uint16_t)) {
+ return false;
+ }
+
+ if (curve_size) {
+ *curve_size = SAFE_FIXED_SIZE(curv_Layout) + value_count * SAFE_SIZEOF(uint16_t);
+ }
+
+ if (value_count < 2) {
+ curve->table_entries = 0;
+ curve->parametric.a = 1.0f;
+ curve->parametric.b = 0.0f;
+ curve->parametric.c = 0.0f;
+ curve->parametric.d = 0.0f;
+ curve->parametric.e = 0.0f;
+ curve->parametric.f = 0.0f;
+ if (value_count == 0) {
+ // Empty tables are a shorthand for an identity curve
+ curve->parametric.g = 1.0f;
+ } else {
+ // Single entry tables are a shorthand for simple gamma
+ curve->parametric.g = read_big_u16(curvTag->variable) * (1.0f / 256.0f);
+ }
+ } else {
+ curve->table_8 = nullptr;
+ curve->table_16 = curvTag->variable;
+ curve->table_entries = value_count;
+ }
+
+ return true;
+}
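+
+// A concrete instance of the single-entry (u8.8 gamma) case above: a table holding just
+// 0x0233 (563/256) is read as a parametric curve with g of about 2.199, the usual "gamma 2.2".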
+
+// Parses both curveType and parametricCurveType data. Ensures that at most 'size' bytes are read.
+// If curve_size is not nullptr, writes the number of bytes used by the curve in (*curve_size).
+static bool read_curve(const uint8_t* buf, uint32_t size,
+ skcms_Curve* curve, uint32_t* curve_size) {
+ if (!buf || size < 4 || !curve) {
+ return false;
+ }
+
+ uint32_t type = read_big_u32(buf);
+ if (type == skcms_Signature_para) {
+ return read_curve_para(buf, size, curve, curve_size);
+ } else if (type == skcms_Signature_curv) {
+ return read_curve_curv(buf, size, curve, curve_size);
+ }
+
+ return false;
+}
+
+// mft1 and mft2 share a large chunk of data
+typedef struct {
+ uint8_t type [ 4];
+ uint8_t reserved_a [ 4];
+ uint8_t input_channels [ 1];
+ uint8_t output_channels [ 1];
+ uint8_t grid_points [ 1];
+ uint8_t reserved_b [ 1];
+ uint8_t matrix [36];
+} mft_CommonLayout;
+
+typedef struct {
+ mft_CommonLayout common [1];
+
+ uint8_t variable [1/*variable*/];
+} mft1_Layout;
+
+typedef struct {
+ mft_CommonLayout common [1];
+
+ uint8_t input_table_entries [2];
+ uint8_t output_table_entries [2];
+ uint8_t variable [1/*variable*/];
+} mft2_Layout;
+
+static bool read_mft_common(const mft_CommonLayout* mftTag, skcms_A2B* a2b) {
+ // MFT matrices are applied before the first set of curves, but must be identity unless the
+ // input is PCSXYZ. We don't support PCSXYZ profiles, so we ignore this matrix. Note that the
+ // matrix in skcms_A2B is applied later in the pipe, so supporting this would require another
+ // field/flag.
+ a2b->matrix_channels = 0;
+ a2b-> input_channels = mftTag-> input_channels[0];
+ a2b->output_channels = mftTag->output_channels[0];
+
+ // We require exactly three (ie XYZ/Lab/RGB) output channels
+ if (a2b->output_channels != ARRAY_COUNT(a2b->output_curves)) {
+ return false;
+ }
+ // We require at least one, and no more than four (ie CMYK) input channels
+ if (a2b->input_channels < 1 || a2b->input_channels > ARRAY_COUNT(a2b->input_curves)) {
+ return false;
+ }
+
+ for (uint32_t i = 0; i < a2b->input_channels; ++i) {
+ a2b->grid_points[i] = mftTag->grid_points[0];
+ }
+ // The grid only makes sense with at least two points along each axis
+ if (a2b->grid_points[0] < 2) {
+ return false;
+ }
+ return true;
+}
+
+// All as the A2B version above, except where noted.
+static bool read_mft_common(const mft_CommonLayout* mftTag, skcms_B2A* b2a) {
+ // Same as A2B.
+ b2a->matrix_channels = 0;
+ b2a-> input_channels = mftTag-> input_channels[0];
+ b2a->output_channels = mftTag->output_channels[0];
+
+
+ // For B2A, exactly 3 input channels (XYZ) and 3 (RGB) or 4 (CMYK) output channels.
+ if (b2a->input_channels != ARRAY_COUNT(b2a->input_curves)) {
+ return false;
+ }
+ if (b2a->output_channels < 3 || b2a->output_channels > ARRAY_COUNT(b2a->output_curves)) {
+ return false;
+ }
+
+ // Same as A2B.
+ for (uint32_t i = 0; i < b2a->input_channels; ++i) {
+ b2a->grid_points[i] = mftTag->grid_points[0];
+ }
+ if (b2a->grid_points[0] < 2) {
+ return false;
+ }
+ return true;
+}
+
+template <typename A2B_or_B2A>
+static bool init_tables(const uint8_t* table_base, uint64_t max_tables_len, uint32_t byte_width,
+ uint32_t input_table_entries, uint32_t output_table_entries,
+ A2B_or_B2A* out) {
+ // byte_width is 1 or 2, [input|output]_table_entries are in [2, 4096], so no overflow
+ uint32_t byte_len_per_input_table = input_table_entries * byte_width;
+ uint32_t byte_len_per_output_table = output_table_entries * byte_width;
+
+ // [input|output]_channels are <= 4, so still no overflow
+ uint32_t byte_len_all_input_tables = out->input_channels * byte_len_per_input_table;
+ uint32_t byte_len_all_output_tables = out->output_channels * byte_len_per_output_table;
+
+ uint64_t grid_size = out->output_channels * byte_width;
+ for (uint32_t axis = 0; axis < out->input_channels; ++axis) {
+ grid_size *= out->grid_points[axis];
+ }
+
+ if (max_tables_len < byte_len_all_input_tables + grid_size + byte_len_all_output_tables) {
+ return false;
+ }
+
+ for (uint32_t i = 0; i < out->input_channels; ++i) {
+ out->input_curves[i].table_entries = input_table_entries;
+ if (byte_width == 1) {
+ out->input_curves[i].table_8 = table_base + i * byte_len_per_input_table;
+ out->input_curves[i].table_16 = nullptr;
+ } else {
+ out->input_curves[i].table_8 = nullptr;
+ out->input_curves[i].table_16 = table_base + i * byte_len_per_input_table;
+ }
+ }
+
+ if (byte_width == 1) {
+ out->grid_8 = table_base + byte_len_all_input_tables;
+ out->grid_16 = nullptr;
+ } else {
+ out->grid_8 = nullptr;
+ out->grid_16 = table_base + byte_len_all_input_tables;
+ }
+
+ const uint8_t* output_table_base = table_base + byte_len_all_input_tables + grid_size;
+ for (uint32_t i = 0; i < out->output_channels; ++i) {
+ out->output_curves[i].table_entries = output_table_entries;
+ if (byte_width == 1) {
+ out->output_curves[i].table_8 = output_table_base + i * byte_len_per_output_table;
+ out->output_curves[i].table_16 = nullptr;
+ } else {
+ out->output_curves[i].table_8 = nullptr;
+ out->output_curves[i].table_16 = output_table_base + i * byte_len_per_output_table;
+ }
+ }
+
+ return true;
+}
+
+template <typename A2B_or_B2A>
+static bool read_tag_mft1(const skcms_ICCTag* tag, A2B_or_B2A* out) {
+ if (tag->size < SAFE_FIXED_SIZE(mft1_Layout)) {
+ return false;
+ }
+
+ const mft1_Layout* mftTag = (const mft1_Layout*)tag->buf;
+ if (!read_mft_common(mftTag->common, out)) {
+ return false;
+ }
+
+ uint32_t input_table_entries = 256;
+ uint32_t output_table_entries = 256;
+
+ return init_tables(mftTag->variable, tag->size - SAFE_FIXED_SIZE(mft1_Layout), 1,
+ input_table_entries, output_table_entries, out);
+}
+
+template <typename A2B_or_B2A>
+static bool read_tag_mft2(const skcms_ICCTag* tag, A2B_or_B2A* out) {
+ if (tag->size < SAFE_FIXED_SIZE(mft2_Layout)) {
+ return false;
+ }
+
+ const mft2_Layout* mftTag = (const mft2_Layout*)tag->buf;
+ if (!read_mft_common(mftTag->common, out)) {
+ return false;
+ }
+
+ uint32_t input_table_entries = read_big_u16(mftTag->input_table_entries);
+ uint32_t output_table_entries = read_big_u16(mftTag->output_table_entries);
+
+ // ICC spec mandates that 2 <= table_entries <= 4096
+ if (input_table_entries < 2 || input_table_entries > 4096 ||
+ output_table_entries < 2 || output_table_entries > 4096) {
+ return false;
+ }
+
+ return init_tables(mftTag->variable, tag->size - SAFE_FIXED_SIZE(mft2_Layout), 2,
+ input_table_entries, output_table_entries, out);
+}
+
+static bool read_curves(const uint8_t* buf, uint32_t size, uint32_t curve_offset,
+ uint32_t num_curves, skcms_Curve* curves) {
+ for (uint32_t i = 0; i < num_curves; ++i) {
+ if (curve_offset > size) {
+ return false;
+ }
+
+ uint32_t curve_bytes;
+ if (!read_curve(buf + curve_offset, size - curve_offset, &curves[i], &curve_bytes)) {
+ return false;
+ }
+
+ if (curve_bytes > UINT32_MAX - 3) {
+ return false;
+ }
+ curve_bytes = (curve_bytes + 3) & ~3U;
+
+ uint64_t new_offset_64 = (uint64_t)curve_offset + curve_bytes;
+ curve_offset = (uint32_t)new_offset_64;
+ if (new_offset_64 != curve_offset) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// mAB and mBA tags use the same encoding, including color lookup tables.
+typedef struct {
+ uint8_t type [ 4];
+ uint8_t reserved_a [ 4];
+ uint8_t input_channels [ 1];
+ uint8_t output_channels [ 1];
+ uint8_t reserved_b [ 2];
+ uint8_t b_curve_offset [ 4];
+ uint8_t matrix_offset [ 4];
+ uint8_t m_curve_offset [ 4];
+ uint8_t clut_offset [ 4];
+ uint8_t a_curve_offset [ 4];
+} mAB_or_mBA_Layout;
+
+typedef struct {
+ uint8_t grid_points [16];
+ uint8_t grid_byte_width [ 1];
+ uint8_t reserved [ 3];
+ uint8_t variable [1/*variable*/];
+} CLUT_Layout;
+
+static bool read_tag_mab(const skcms_ICCTag* tag, skcms_A2B* a2b, bool pcs_is_xyz) {
+ if (tag->size < SAFE_SIZEOF(mAB_or_mBA_Layout)) {
+ return false;
+ }
+
+ const mAB_or_mBA_Layout* mABTag = (const mAB_or_mBA_Layout*)tag->buf;
+
+ a2b->input_channels = mABTag->input_channels[0];
+ a2b->output_channels = mABTag->output_channels[0];
+
+ // We require exactly three (ie XYZ/Lab/RGB) output channels
+ if (a2b->output_channels != ARRAY_COUNT(a2b->output_curves)) {
+ return false;
+ }
+ // We require no more than four (ie CMYK) input channels
+ if (a2b->input_channels > ARRAY_COUNT(a2b->input_curves)) {
+ return false;
+ }
+
+ uint32_t b_curve_offset = read_big_u32(mABTag->b_curve_offset);
+ uint32_t matrix_offset = read_big_u32(mABTag->matrix_offset);
+ uint32_t m_curve_offset = read_big_u32(mABTag->m_curve_offset);
+ uint32_t clut_offset = read_big_u32(mABTag->clut_offset);
+ uint32_t a_curve_offset = read_big_u32(mABTag->a_curve_offset);
+
+ // "B" curves must be present
+ if (0 == b_curve_offset) {
+ return false;
+ }
+
+ if (!read_curves(tag->buf, tag->size, b_curve_offset, a2b->output_channels,
+ a2b->output_curves)) {
+ return false;
+ }
+
+ // "M" curves and Matrix must be used together
+ if (0 != m_curve_offset) {
+ if (0 == matrix_offset) {
+ return false;
+ }
+ a2b->matrix_channels = a2b->output_channels;
+ if (!read_curves(tag->buf, tag->size, m_curve_offset, a2b->matrix_channels,
+ a2b->matrix_curves)) {
+ return false;
+ }
+
+ // Read matrix, which is stored as a row-major 3x3, followed by the fourth column
+ if (tag->size < matrix_offset + 12 * SAFE_SIZEOF(uint32_t)) {
+ return false;
+ }
+ float encoding_factor = pcs_is_xyz ? (65535 / 32768.0f) : 1.0f;
+ const uint8_t* mtx_buf = tag->buf + matrix_offset;
+ a2b->matrix.vals[0][0] = encoding_factor * read_big_fixed(mtx_buf + 0);
+ a2b->matrix.vals[0][1] = encoding_factor * read_big_fixed(mtx_buf + 4);
+ a2b->matrix.vals[0][2] = encoding_factor * read_big_fixed(mtx_buf + 8);
+ a2b->matrix.vals[1][0] = encoding_factor * read_big_fixed(mtx_buf + 12);
+ a2b->matrix.vals[1][1] = encoding_factor * read_big_fixed(mtx_buf + 16);
+ a2b->matrix.vals[1][2] = encoding_factor * read_big_fixed(mtx_buf + 20);
+ a2b->matrix.vals[2][0] = encoding_factor * read_big_fixed(mtx_buf + 24);
+ a2b->matrix.vals[2][1] = encoding_factor * read_big_fixed(mtx_buf + 28);
+ a2b->matrix.vals[2][2] = encoding_factor * read_big_fixed(mtx_buf + 32);
+ a2b->matrix.vals[0][3] = encoding_factor * read_big_fixed(mtx_buf + 36);
+ a2b->matrix.vals[1][3] = encoding_factor * read_big_fixed(mtx_buf + 40);
+ a2b->matrix.vals[2][3] = encoding_factor * read_big_fixed(mtx_buf + 44);
+ } else {
+ if (0 != matrix_offset) {
+ return false;
+ }
+ a2b->matrix_channels = 0;
+ }
+
+ // "A" curves and CLUT must be used together
+ if (0 != a_curve_offset) {
+ if (0 == clut_offset) {
+ return false;
+ }
+ if (!read_curves(tag->buf, tag->size, a_curve_offset, a2b->input_channels,
+ a2b->input_curves)) {
+ return false;
+ }
+
+ if (tag->size < clut_offset + SAFE_FIXED_SIZE(CLUT_Layout)) {
+ return false;
+ }
+ const CLUT_Layout* clut = (const CLUT_Layout*)(tag->buf + clut_offset);
+
+ if (clut->grid_byte_width[0] == 1) {
+ a2b->grid_8 = clut->variable;
+ a2b->grid_16 = nullptr;
+ } else if (clut->grid_byte_width[0] == 2) {
+ a2b->grid_8 = nullptr;
+ a2b->grid_16 = clut->variable;
+ } else {
+ return false;
+ }
+
+ uint64_t grid_size = a2b->output_channels * clut->grid_byte_width[0]; // the payload
+ for (uint32_t i = 0; i < a2b->input_channels; ++i) {
+ a2b->grid_points[i] = clut->grid_points[i];
+ // The grid only makes sense with at least two points along each axis
+ if (a2b->grid_points[i] < 2) {
+ return false;
+ }
+ grid_size *= a2b->grid_points[i];
+ }
+ if (tag->size < clut_offset + SAFE_FIXED_SIZE(CLUT_Layout) + grid_size) {
+ return false;
+ }
+ } else {
+ if (0 != clut_offset) {
+ return false;
+ }
+
+ // If there is no CLUT, the number of input and output channels must match
+ if (a2b->input_channels != a2b->output_channels) {
+ return false;
+ }
+
+ // Zero out the number of input channels to signal that we're skipping this stage
+ a2b->input_channels = 0;
+ }
+
+ return true;
+}
+
+// Exactly the same as read_tag_mab(), except where there are comments.
+// TODO: refactor the two to eliminate common code?
+static bool read_tag_mba(const skcms_ICCTag* tag, skcms_B2A* b2a, bool pcs_is_xyz) {
+ if (tag->size < SAFE_SIZEOF(mAB_or_mBA_Layout)) {
+ return false;
+ }
+
+ const mAB_or_mBA_Layout* mBATag = (const mAB_or_mBA_Layout*)tag->buf;
+
+ b2a->input_channels = mBATag->input_channels[0];
+ b2a->output_channels = mBATag->output_channels[0];
+
+ // Require exactly 3 inputs (XYZ) and 3 (RGB) or 4 (CMYK) outputs.
+ if (b2a->input_channels != ARRAY_COUNT(b2a->input_curves)) {
+ return false;
+ }
+ if (b2a->output_channels < 3 || b2a->output_channels > ARRAY_COUNT(b2a->output_curves)) {
+ return false;
+ }
+
+ uint32_t b_curve_offset = read_big_u32(mBATag->b_curve_offset);
+ uint32_t matrix_offset = read_big_u32(mBATag->matrix_offset);
+ uint32_t m_curve_offset = read_big_u32(mBATag->m_curve_offset);
+ uint32_t clut_offset = read_big_u32(mBATag->clut_offset);
+ uint32_t a_curve_offset = read_big_u32(mBATag->a_curve_offset);
+
+ if (0 == b_curve_offset) {
+ return false;
+ }
+
+ // "B" curves are our inputs, not outputs.
+ if (!read_curves(tag->buf, tag->size, b_curve_offset, b2a->input_channels,
+ b2a->input_curves)) {
+ return false;
+ }
+
+ if (0 != m_curve_offset) {
+ if (0 == matrix_offset) {
+ return false;
+ }
+ // Matrix channels is tied to input_channels (3), not output_channels.
+ b2a->matrix_channels = b2a->input_channels;
+
+ if (!read_curves(tag->buf, tag->size, m_curve_offset, b2a->matrix_channels,
+ b2a->matrix_curves)) {
+ return false;
+ }
+
+ if (tag->size < matrix_offset + 12 * SAFE_SIZEOF(uint32_t)) {
+ return false;
+ }
+ float encoding_factor = pcs_is_xyz ? (32768 / 65535.0f) : 1.0f; // TODO: understand
+ const uint8_t* mtx_buf = tag->buf + matrix_offset;
+ b2a->matrix.vals[0][0] = encoding_factor * read_big_fixed(mtx_buf + 0);
+ b2a->matrix.vals[0][1] = encoding_factor * read_big_fixed(mtx_buf + 4);
+ b2a->matrix.vals[0][2] = encoding_factor * read_big_fixed(mtx_buf + 8);
+ b2a->matrix.vals[1][0] = encoding_factor * read_big_fixed(mtx_buf + 12);
+ b2a->matrix.vals[1][1] = encoding_factor * read_big_fixed(mtx_buf + 16);
+ b2a->matrix.vals[1][2] = encoding_factor * read_big_fixed(mtx_buf + 20);
+ b2a->matrix.vals[2][0] = encoding_factor * read_big_fixed(mtx_buf + 24);
+ b2a->matrix.vals[2][1] = encoding_factor * read_big_fixed(mtx_buf + 28);
+ b2a->matrix.vals[2][2] = encoding_factor * read_big_fixed(mtx_buf + 32);
+ b2a->matrix.vals[0][3] = encoding_factor * read_big_fixed(mtx_buf + 36);
+ b2a->matrix.vals[1][3] = encoding_factor * read_big_fixed(mtx_buf + 40);
+ b2a->matrix.vals[2][3] = encoding_factor * read_big_fixed(mtx_buf + 44);
+ } else {
+ if (0 != matrix_offset) {
+ return false;
+ }
+ b2a->matrix_channels = 0;
+ }
+
+ if (0 != a_curve_offset) {
+ if (0 == clut_offset) {
+ return false;
+ }
+
+ // "A" curves are our output, not input.
+ if (!read_curves(tag->buf, tag->size, a_curve_offset, b2a->output_channels,
+ b2a->output_curves)) {
+ return false;
+ }
+
+ if (tag->size < clut_offset + SAFE_FIXED_SIZE(CLUT_Layout)) {
+ return false;
+ }
+ const CLUT_Layout* clut = (const CLUT_Layout*)(tag->buf + clut_offset);
+
+ if (clut->grid_byte_width[0] == 1) {
+ b2a->grid_8 = clut->variable;
+ b2a->grid_16 = nullptr;
+ } else if (clut->grid_byte_width[0] == 2) {
+ b2a->grid_8 = nullptr;
+ b2a->grid_16 = clut->variable;
+ } else {
+ return false;
+ }
+
+ uint64_t grid_size = b2a->output_channels * clut->grid_byte_width[0];
+ for (uint32_t i = 0; i < b2a->input_channels; ++i) {
+ b2a->grid_points[i] = clut->grid_points[i];
+ if (b2a->grid_points[i] < 2) {
+ return false;
+ }
+ grid_size *= b2a->grid_points[i];
+ }
+ if (tag->size < clut_offset + SAFE_FIXED_SIZE(CLUT_Layout) + grid_size) {
+ return false;
+ }
+ } else {
+ if (0 != clut_offset) {
+ return false;
+ }
+
+ if (b2a->input_channels != b2a->output_channels) {
+ return false;
+ }
+
+ // Zero out *output* channels to skip this stage.
+ b2a->output_channels = 0;
+ }
+ return true;
+}
+
+// If you pass f, we'll fit a possibly-non-zero value for *f.
+// If you pass nullptr, we'll assume you want *f to be treated as zero.
+static int fit_linear(const skcms_Curve* curve, int N, float tol,
+ float* c, float* d, float* f = nullptr) {
+ assert(N > 1);
+ // We iteratively fit the first points to the TF's linear piece.
+ // We want the cx + f line to pass through the first and last points we fit exactly.
+ //
+ // As we walk along the points we find the minimum and maximum slope of the line before the
+ // error would exceed our tolerance. We stop when the range [slope_min, slope_max] becomes
+    // empty, at which point we definitely can't add any more points.
+ //
+ // Some points' error intervals may intersect the running interval but not lie fully
+ // within it. So we keep track of the last point we saw that is a valid end point candidate,
+ // and once the search is done, back up to build the line through *that* point.
+ const float dx = 1.0f / static_cast<float>(N - 1);
+
+ int lin_points = 1;
+
+ float f_zero = 0.0f;
+ if (f) {
+ *f = eval_curve(curve, 0);
+ } else {
+ f = &f_zero;
+ }
+
+
+ float slope_min = -INFINITY_;
+ float slope_max = +INFINITY_;
+ for (int i = 1; i < N; ++i) {
+ float x = static_cast<float>(i) * dx;
+ float y = eval_curve(curve, x);
+
+ float slope_max_i = (y + tol - *f) / x,
+ slope_min_i = (y - tol - *f) / x;
+ if (slope_max_i < slope_min || slope_max < slope_min_i) {
+ // Slope intervals would no longer overlap.
+ break;
+ }
+ slope_max = fminf_(slope_max, slope_max_i);
+ slope_min = fmaxf_(slope_min, slope_min_i);
+
+ float cur_slope = (y - *f) / x;
+ if (slope_min <= cur_slope && cur_slope <= slope_max) {
+ lin_points = i + 1;
+ *c = cur_slope;
+ }
+ }
+
+ // Set D to the last point that met our tolerance.
+ *d = static_cast<float>(lin_points - 1) * dx;
+ return lin_points;
+}
+
+// If this skcms_Curve holds an identity table, rewrite it as an identity skcms_TransferFunction.
+static void canonicalize_identity(skcms_Curve* curve) {
+ if (curve->table_entries && curve->table_entries <= (uint32_t)INT_MAX) {
+ int N = (int)curve->table_entries;
+
+ float c = 0.0f, d = 0.0f, f = 0.0f;
+ if (N == fit_linear(curve, N, 1.0f/static_cast<float>(2*N), &c,&d,&f)
+ && c == 1.0f
+ && f == 0.0f) {
+ curve->table_entries = 0;
+ curve->table_8 = nullptr;
+ curve->table_16 = nullptr;
+ curve->parametric = skcms_TransferFunction{1,1,0,0,0,0,0};
+ }
+ }
+}
+
+static bool read_a2b(const skcms_ICCTag* tag, skcms_A2B* a2b, bool pcs_is_xyz) {
+ bool ok = false;
+ if (tag->type == skcms_Signature_mft1) { ok = read_tag_mft1(tag, a2b); }
+ if (tag->type == skcms_Signature_mft2) { ok = read_tag_mft2(tag, a2b); }
+ if (tag->type == skcms_Signature_mAB ) { ok = read_tag_mab(tag, a2b, pcs_is_xyz); }
+ if (!ok) {
+ return false;
+ }
+
+ if (a2b->input_channels > 0) { canonicalize_identity(a2b->input_curves + 0); }
+ if (a2b->input_channels > 1) { canonicalize_identity(a2b->input_curves + 1); }
+ if (a2b->input_channels > 2) { canonicalize_identity(a2b->input_curves + 2); }
+ if (a2b->input_channels > 3) { canonicalize_identity(a2b->input_curves + 3); }
+
+ if (a2b->matrix_channels > 0) { canonicalize_identity(a2b->matrix_curves + 0); }
+ if (a2b->matrix_channels > 1) { canonicalize_identity(a2b->matrix_curves + 1); }
+ if (a2b->matrix_channels > 2) { canonicalize_identity(a2b->matrix_curves + 2); }
+
+ if (a2b->output_channels > 0) { canonicalize_identity(a2b->output_curves + 0); }
+ if (a2b->output_channels > 1) { canonicalize_identity(a2b->output_curves + 1); }
+ if (a2b->output_channels > 2) { canonicalize_identity(a2b->output_curves + 2); }
+
+ return true;
+}
+
+static bool read_b2a(const skcms_ICCTag* tag, skcms_B2A* b2a, bool pcs_is_xyz) {
+ bool ok = false;
+ if (tag->type == skcms_Signature_mft1) { ok = read_tag_mft1(tag, b2a); }
+ if (tag->type == skcms_Signature_mft2) { ok = read_tag_mft2(tag, b2a); }
+ if (tag->type == skcms_Signature_mBA ) { ok = read_tag_mba(tag, b2a, pcs_is_xyz); }
+ if (!ok) {
+ return false;
+ }
+
+ if (b2a->input_channels > 0) { canonicalize_identity(b2a->input_curves + 0); }
+ if (b2a->input_channels > 1) { canonicalize_identity(b2a->input_curves + 1); }
+ if (b2a->input_channels > 2) { canonicalize_identity(b2a->input_curves + 2); }
+
+ if (b2a->matrix_channels > 0) { canonicalize_identity(b2a->matrix_curves + 0); }
+ if (b2a->matrix_channels > 1) { canonicalize_identity(b2a->matrix_curves + 1); }
+ if (b2a->matrix_channels > 2) { canonicalize_identity(b2a->matrix_curves + 2); }
+
+ if (b2a->output_channels > 0) { canonicalize_identity(b2a->output_curves + 0); }
+ if (b2a->output_channels > 1) { canonicalize_identity(b2a->output_curves + 1); }
+ if (b2a->output_channels > 2) { canonicalize_identity(b2a->output_curves + 2); }
+ if (b2a->output_channels > 3) { canonicalize_identity(b2a->output_curves + 3); }
+
+ return true;
+}
+
+typedef struct {
+ uint8_t type [4];
+ uint8_t reserved [4];
+ uint8_t color_primaries [1];
+ uint8_t transfer_characteristics [1];
+ uint8_t matrix_coefficients [1];
+ uint8_t video_full_range_flag [1];
+} CICP_Layout;
+
+static bool read_cicp(const skcms_ICCTag* tag, skcms_CICP* cicp) {
+ if (tag->type != skcms_Signature_CICP || tag->size < SAFE_SIZEOF(CICP_Layout)) {
+ return false;
+ }
+
+ const CICP_Layout* cicpTag = (const CICP_Layout*)tag->buf;
+
+ cicp->color_primaries = cicpTag->color_primaries[0];
+ cicp->transfer_characteristics = cicpTag->transfer_characteristics[0];
+ cicp->matrix_coefficients = cicpTag->matrix_coefficients[0];
+ cicp->video_full_range_flag = cicpTag->video_full_range_flag[0];
+ return true;
+}
+
+void skcms_GetTagByIndex(const skcms_ICCProfile* profile, uint32_t idx, skcms_ICCTag* tag) {
+ if (!profile || !profile->buffer || !tag) { return; }
+ if (idx > profile->tag_count) { return; }
+ const tag_Layout* tags = get_tag_table(profile);
+ tag->signature = read_big_u32(tags[idx].signature);
+ tag->size = read_big_u32(tags[idx].size);
+ tag->buf = read_big_u32(tags[idx].offset) + profile->buffer;
+ tag->type = read_big_u32(tag->buf);
+}
+
+bool skcms_GetTagBySignature(const skcms_ICCProfile* profile, uint32_t sig, skcms_ICCTag* tag) {
+ if (!profile || !profile->buffer || !tag) { return false; }
+ const tag_Layout* tags = get_tag_table(profile);
+ for (uint32_t i = 0; i < profile->tag_count; ++i) {
+ if (read_big_u32(tags[i].signature) == sig) {
+ tag->signature = sig;
+ tag->size = read_big_u32(tags[i].size);
+ tag->buf = read_big_u32(tags[i].offset) + profile->buffer;
+ tag->type = read_big_u32(tag->buf);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool usable_as_src(const skcms_ICCProfile* profile) {
+ return profile->has_A2B
+ || (profile->has_trc && profile->has_toXYZD50);
+}
+
+bool skcms_ParseWithA2BPriority(const void* buf, size_t len,
+ const int priority[], const int priorities,
+ skcms_ICCProfile* profile) {
+ assert(SAFE_SIZEOF(header_Layout) == 132);
+
+ if (!profile) {
+ return false;
+ }
+ memset(profile, 0, SAFE_SIZEOF(*profile));
+
+ if (len < SAFE_SIZEOF(header_Layout)) {
+ return false;
+ }
+
+ // Byte-swap all header fields
+ const header_Layout* header = (const header_Layout*)buf;
+ profile->buffer = (const uint8_t*)buf;
+ profile->size = read_big_u32(header->size);
+ uint32_t version = read_big_u32(header->version);
+ profile->data_color_space = read_big_u32(header->data_color_space);
+ profile->pcs = read_big_u32(header->pcs);
+ uint32_t signature = read_big_u32(header->signature);
+ float illuminant_X = read_big_fixed(header->illuminant_X);
+ float illuminant_Y = read_big_fixed(header->illuminant_Y);
+ float illuminant_Z = read_big_fixed(header->illuminant_Z);
+ profile->tag_count = read_big_u32(header->tag_count);
+
+ // Validate signature, size (smaller than buffer, large enough to hold tag table),
+ // and major version
+ uint64_t tag_table_size = profile->tag_count * SAFE_SIZEOF(tag_Layout);
+ if (signature != skcms_Signature_acsp ||
+ profile->size > len ||
+ profile->size < SAFE_SIZEOF(header_Layout) + tag_table_size ||
+ (version >> 24) > 4) {
+ return false;
+ }
+
+ // Validate that illuminant is D50 white
+ if (fabsf_(illuminant_X - 0.9642f) > 0.0100f ||
+ fabsf_(illuminant_Y - 1.0000f) > 0.0100f ||
+ fabsf_(illuminant_Z - 0.8249f) > 0.0100f) {
+ return false;
+ }
+
+ // Validate that all tag entries have sane offset + size
+ const tag_Layout* tags = get_tag_table(profile);
+ for (uint32_t i = 0; i < profile->tag_count; ++i) {
+ uint32_t tag_offset = read_big_u32(tags[i].offset);
+ uint32_t tag_size = read_big_u32(tags[i].size);
+ uint64_t tag_end = (uint64_t)tag_offset + (uint64_t)tag_size;
+ if (tag_size < 4 || tag_end > profile->size) {
+ return false;
+ }
+ }
+
+ if (profile->pcs != skcms_Signature_XYZ && profile->pcs != skcms_Signature_Lab) {
+ return false;
+ }
+
+ bool pcs_is_xyz = profile->pcs == skcms_Signature_XYZ;
+
+ // Pre-parse commonly used tags.
+ skcms_ICCTag kTRC;
+ if (profile->data_color_space == skcms_Signature_Gray &&
+ skcms_GetTagBySignature(profile, skcms_Signature_kTRC, &kTRC)) {
+ if (!read_curve(kTRC.buf, kTRC.size, &profile->trc[0], nullptr)) {
+ // Malformed tag
+ return false;
+ }
+ profile->trc[1] = profile->trc[0];
+ profile->trc[2] = profile->trc[0];
+ profile->has_trc = true;
+
+ if (pcs_is_xyz) {
+ profile->toXYZD50.vals[0][0] = illuminant_X;
+ profile->toXYZD50.vals[1][1] = illuminant_Y;
+ profile->toXYZD50.vals[2][2] = illuminant_Z;
+ profile->has_toXYZD50 = true;
+ }
+ } else {
+ skcms_ICCTag rTRC, gTRC, bTRC;
+ if (skcms_GetTagBySignature(profile, skcms_Signature_rTRC, &rTRC) &&
+ skcms_GetTagBySignature(profile, skcms_Signature_gTRC, &gTRC) &&
+ skcms_GetTagBySignature(profile, skcms_Signature_bTRC, &bTRC)) {
+ if (!read_curve(rTRC.buf, rTRC.size, &profile->trc[0], nullptr) ||
+ !read_curve(gTRC.buf, gTRC.size, &profile->trc[1], nullptr) ||
+ !read_curve(bTRC.buf, bTRC.size, &profile->trc[2], nullptr)) {
+ // Malformed TRC tags
+ return false;
+ }
+ profile->has_trc = true;
+ }
+
+ skcms_ICCTag rXYZ, gXYZ, bXYZ;
+ if (skcms_GetTagBySignature(profile, skcms_Signature_rXYZ, &rXYZ) &&
+ skcms_GetTagBySignature(profile, skcms_Signature_gXYZ, &gXYZ) &&
+ skcms_GetTagBySignature(profile, skcms_Signature_bXYZ, &bXYZ)) {
+ if (!read_to_XYZD50(&rXYZ, &gXYZ, &bXYZ, &profile->toXYZD50)) {
+ // Malformed XYZ tags
+ return false;
+ }
+ profile->has_toXYZD50 = true;
+ }
+ }
+
+ for (int i = 0; i < priorities; i++) {
+        // enum { perceptual, relative_colorimetric, saturation }
+ if (priority[i] < 0 || priority[i] > 2) {
+ return false;
+ }
+ uint32_t sig = skcms_Signature_A2B0 + static_cast<uint32_t>(priority[i]);
+ skcms_ICCTag tag;
+ if (skcms_GetTagBySignature(profile, sig, &tag)) {
+ if (!read_a2b(&tag, &profile->A2B, pcs_is_xyz)) {
+ // Malformed A2B tag
+ return false;
+ }
+ profile->has_A2B = true;
+ break;
+ }
+ }
+
+ for (int i = 0; i < priorities; i++) {
+        // enum { perceptual, relative_colorimetric, saturation }
+ if (priority[i] < 0 || priority[i] > 2) {
+ return false;
+ }
+ uint32_t sig = skcms_Signature_B2A0 + static_cast<uint32_t>(priority[i]);
+ skcms_ICCTag tag;
+ if (skcms_GetTagBySignature(profile, sig, &tag)) {
+ if (!read_b2a(&tag, &profile->B2A, pcs_is_xyz)) {
+ // Malformed B2A tag
+ return false;
+ }
+ profile->has_B2A = true;
+ break;
+ }
+ }
+
+ skcms_ICCTag cicp_tag;
+ if (skcms_GetTagBySignature(profile, skcms_Signature_CICP, &cicp_tag)) {
+ if (!read_cicp(&cicp_tag, &profile->CICP)) {
+ // Malformed CICP tag
+ return false;
+ }
+ profile->has_CICP = true;
+ }
+
+ return usable_as_src(profile);
+}
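+
+// A minimal direct-call sketch; icc_bytes/icc_len stand in for an ICC profile blob:
+//
+//     skcms_ICCProfile profile;
+//     const int priority[] = { 0, 1, 2 };  // perceptual, relative colorimetric, saturation
+//     bool ok = skcms_ParseWithA2BPriority(icc_bytes, icc_len, priority, 3, &profile);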
+
+
+const skcms_ICCProfile* skcms_sRGB_profile() {
+ static const skcms_ICCProfile sRGB_profile = {
+ nullptr, // buffer, moot here
+
+ 0, // size, moot here
+ skcms_Signature_RGB, // data_color_space
+ skcms_Signature_XYZ, // pcs
+ 0, // tag count, moot here
+
+ // We choose to represent sRGB with its canonical transfer function,
+ // and with its canonical XYZD50 gamut matrix.
+ true, // has_trc, followed by the 3 trc curves
+ {
+ {{0, {2.4f, (float)(1/1.055), (float)(0.055/1.055), (float)(1/12.92), 0.04045f, 0, 0}}},
+ {{0, {2.4f, (float)(1/1.055), (float)(0.055/1.055), (float)(1/12.92), 0.04045f, 0, 0}}},
+ {{0, {2.4f, (float)(1/1.055), (float)(0.055/1.055), (float)(1/12.92), 0.04045f, 0, 0}}},
+ },
+
+ true, // has_toXYZD50, followed by 3x3 toXYZD50 matrix
+ {{
+ { 0.436065674f, 0.385147095f, 0.143066406f },
+ { 0.222488403f, 0.716873169f, 0.060607910f },
+ { 0.013916016f, 0.097076416f, 0.714096069f },
+ }},
+
+ false, // has_A2B, followed by A2B itself, which we don't care about.
+ {
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ {0,0,0,0},
+ nullptr,
+ nullptr,
+
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ {{
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ }},
+
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ },
+
+ false, // has_B2A, followed by B2A itself, which we also don't care about.
+ {
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+
+ 0,
+ {{
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ }},
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+
+ 0,
+ {0,0,0,0},
+ nullptr,
+ nullptr,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ },
+
+ false, // has_CICP, followed by cicp itself which we don't care about.
+ { 0, 0, 0, 0 },
+ };
+ return &sRGB_profile;
+}
+
+const skcms_ICCProfile* skcms_XYZD50_profile() {
+ // Just like sRGB above, but with identity transfer functions and toXYZD50 matrix.
+ static const skcms_ICCProfile XYZD50_profile = {
+ nullptr, // buffer, moot here
+
+ 0, // size, moot here
+ skcms_Signature_RGB, // data_color_space
+ skcms_Signature_XYZ, // pcs
+ 0, // tag count, moot here
+
+ true, // has_trc, followed by the 3 trc curves
+ {
+ {{0, {1,1, 0,0,0,0,0}}},
+ {{0, {1,1, 0,0,0,0,0}}},
+ {{0, {1,1, 0,0,0,0,0}}},
+ },
+
+ true, // has_toXYZD50, followed by 3x3 toXYZD50 matrix
+ {{
+ { 1,0,0 },
+ { 0,1,0 },
+ { 0,0,1 },
+ }},
+
+ false, // has_A2B, followed by A2B itself, which we don't care about.
+ {
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ {0,0,0,0},
+ nullptr,
+ nullptr,
+
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ {{
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ }},
+
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ },
+
+ false, // has_B2A, followed by B2A itself, which we also don't care about.
+ {
+ 0,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+
+ 0,
+ {{
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ { 0,0,0,0 },
+ }},
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+
+ 0,
+ {0,0,0,0},
+ nullptr,
+ nullptr,
+ {
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ {{0, {0,0, 0,0,0,0,0}}},
+ },
+ },
+
+ false, // has_CICP, followed by cicp itself which we don't care about.
+ { 0, 0, 0, 0 },
+ };
+
+ return &XYZD50_profile;
+}
+
+const skcms_TransferFunction* skcms_sRGB_TransferFunction() {
+ return &skcms_sRGB_profile()->trc[0].parametric;
+}
+
+const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() {
+ static const skcms_TransferFunction sRGB_inv =
+ {0.416666657f, 1.137283325f, -0.0f, 12.920000076f, 0.003130805f, -0.054969788f, -0.0f};
+ return &sRGB_inv;
+}
+
+const skcms_TransferFunction* skcms_Identity_TransferFunction() {
+ static const skcms_TransferFunction identity = {1,1,0,0,0,0,0};
+ return &identity;
+}
+
+const uint8_t skcms_252_random_bytes[] = {
+ 8, 179, 128, 204, 253, 38, 134, 184, 68, 102, 32, 138, 99, 39, 169, 215,
+ 119, 26, 3, 223, 95, 239, 52, 132, 114, 74, 81, 234, 97, 116, 244, 205, 30,
+ 154, 173, 12, 51, 159, 122, 153, 61, 226, 236, 178, 229, 55, 181, 220, 191,
+ 194, 160, 126, 168, 82, 131, 18, 180, 245, 163, 22, 246, 69, 235, 252, 57,
+ 108, 14, 6, 152, 240, 255, 171, 242, 20, 227, 177, 238, 96, 85, 16, 211,
+ 70, 200, 149, 155, 146, 127, 145, 100, 151, 109, 19, 165, 208, 195, 164,
+ 137, 254, 182, 248, 64, 201, 45, 209, 5, 147, 207, 210, 113, 162, 83, 225,
+ 9, 31, 15, 231, 115, 37, 58, 53, 24, 49, 197, 56, 120, 172, 48, 21, 214,
+ 129, 111, 11, 50, 187, 196, 34, 60, 103, 71, 144, 47, 203, 77, 80, 232,
+ 140, 222, 250, 206, 166, 247, 139, 249, 221, 72, 106, 27, 199, 117, 54,
+ 219, 135, 118, 40, 79, 41, 251, 46, 93, 212, 92, 233, 148, 28, 121, 63,
+ 123, 158, 105, 59, 29, 42, 143, 23, 0, 107, 176, 87, 104, 183, 156, 193,
+ 189, 90, 188, 65, 190, 17, 198, 7, 186, 161, 1, 124, 78, 125, 170, 133,
+ 174, 218, 67, 157, 75, 101, 89, 217, 62, 33, 141, 228, 25, 35, 91, 230, 4,
+ 2, 13, 73, 86, 167, 237, 84, 243, 44, 185, 66, 130, 110, 150, 142, 216, 88,
+ 112, 36, 224, 136, 202, 76, 94, 98, 175, 213
+};
+
+bool skcms_ApproximatelyEqualProfiles(const skcms_ICCProfile* A, const skcms_ICCProfile* B) {
+ // Test for exactly equal profiles first.
+ if (A == B || 0 == memcmp(A,B, sizeof(skcms_ICCProfile))) {
+ return true;
+ }
+
+    // For now this is essentially the same strategy we use in test_only.c
+ // for our skcms_Transform() smoke tests:
+ // 1) transform A to XYZD50
+ // 2) transform B to XYZD50
+ // 3) return true if they're similar enough
+ // Our current criterion in 3) is maximum 1 bit error per XYZD50 byte.
+
+    // skcms_252_random_bytes are 252 bytes from a random shuffle of all 256 possible byte values.
+ // 252 is evenly divisible by 3 and 4. Only 192, 10, 241, and 43 are missing.
+
+ // We want to allow otherwise equivalent profiles tagged as grayscale and RGB
+ // to be treated as equal. But CMYK profiles are a totally different ballgame.
+ const auto CMYK = skcms_Signature_CMYK;
+ if ((A->data_color_space == CMYK) != (B->data_color_space == CMYK)) {
+ return false;
+ }
+
+ // Interpret as RGB_888 if data color space is RGB or GRAY, RGBA_8888 if CMYK.
+ // TODO: working with RGBA_8888 either way is probably fastest.
+ skcms_PixelFormat fmt = skcms_PixelFormat_RGB_888;
+ size_t npixels = 84;
+ if (A->data_color_space == skcms_Signature_CMYK) {
+ fmt = skcms_PixelFormat_RGBA_8888;
+ npixels = 63;
+ }
+
+ // TODO: if A or B is a known profile (skcms_sRGB_profile, skcms_XYZD50_profile),
+ // use pre-canned results and skip that skcms_Transform() call?
+ uint8_t dstA[252],
+ dstB[252];
+ if (!skcms_Transform(
+ skcms_252_random_bytes, fmt, skcms_AlphaFormat_Unpremul, A,
+ dstA, skcms_PixelFormat_RGB_888, skcms_AlphaFormat_Unpremul, skcms_XYZD50_profile(),
+ npixels)) {
+ return false;
+ }
+ if (!skcms_Transform(
+ skcms_252_random_bytes, fmt, skcms_AlphaFormat_Unpremul, B,
+ dstB, skcms_PixelFormat_RGB_888, skcms_AlphaFormat_Unpremul, skcms_XYZD50_profile(),
+ npixels)) {
+ return false;
+ }
+
+ // TODO: make sure this final check has reasonable codegen.
+ for (size_t i = 0; i < 252; i++) {
+ if (abs((int)dstA[i] - (int)dstB[i]) > 1) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool skcms_TRCs_AreApproximateInverse(const skcms_ICCProfile* profile,
+ const skcms_TransferFunction* inv_tf) {
+ if (!profile || !profile->has_trc) {
+ return false;
+ }
+
+ return skcms_AreApproximateInverses(&profile->trc[0], inv_tf) &&
+ skcms_AreApproximateInverses(&profile->trc[1], inv_tf) &&
+ skcms_AreApproximateInverses(&profile->trc[2], inv_tf);
+}
+
+static bool is_zero_to_one(float x) {
+ return 0 <= x && x <= 1;
+}
+
+typedef struct { float vals[3]; } skcms_Vector3;
+
+static skcms_Vector3 mv_mul(const skcms_Matrix3x3* m, const skcms_Vector3* v) {
+ skcms_Vector3 dst = {{0,0,0}};
+ for (int row = 0; row < 3; ++row) {
+ dst.vals[row] = m->vals[row][0] * v->vals[0]
+ + m->vals[row][1] * v->vals[1]
+ + m->vals[row][2] * v->vals[2];
+ }
+ return dst;
+}
+
+bool skcms_AdaptToXYZD50(float wx, float wy,
+ skcms_Matrix3x3* toXYZD50) {
+ if (!is_zero_to_one(wx) || !is_zero_to_one(wy) ||
+ !toXYZD50) {
+ return false;
+ }
+
+ // Assumes that Y is 1.0f.
+ skcms_Vector3 wXYZ = { { wx / wy, 1, (1 - wx - wy) / wy } };
+
+ // Now convert toXYZ matrix to toXYZD50.
+ skcms_Vector3 wXYZD50 = { { 0.96422f, 1.0f, 0.82521f } };
+
+ // Calculate the chromatic adaptation matrix. We will use the Bradford method, thus
+ // the matrices below. The Bradford method is used by Adobe and is widely considered
+ // to be the best.
+ skcms_Matrix3x3 xyz_to_lms = {{
+ { 0.8951f, 0.2664f, -0.1614f },
+ { -0.7502f, 1.7135f, 0.0367f },
+ { 0.0389f, -0.0685f, 1.0296f },
+ }};
+ skcms_Matrix3x3 lms_to_xyz = {{
+ { 0.9869929f, -0.1470543f, 0.1599627f },
+ { 0.4323053f, 0.5183603f, 0.0492912f },
+ { -0.0085287f, 0.0400428f, 0.9684867f },
+ }};
+
+ skcms_Vector3 srcCone = mv_mul(&xyz_to_lms, &wXYZ);
+ skcms_Vector3 dstCone = mv_mul(&xyz_to_lms, &wXYZD50);
+
+ *toXYZD50 = {{
+ { dstCone.vals[0] / srcCone.vals[0], 0, 0 },
+ { 0, dstCone.vals[1] / srcCone.vals[1], 0 },
+ { 0, 0, dstCone.vals[2] / srcCone.vals[2] },
+ }};
+ *toXYZD50 = skcms_Matrix3x3_concat(toXYZD50, &xyz_to_lms);
+ *toXYZD50 = skcms_Matrix3x3_concat(&lms_to_xyz, toXYZD50);
+
+ return true;
+}
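+
+// Usage sketch: adapting a D65 white point (x ~0.3127, y ~0.3290) to D50 with the
+// Bradford matrices above; skcms_PrimariesToXYZD50() below calls this for whatever
+// white point it is given.
+//
+//     skcms_Matrix3x3 d65_to_d50;
+//     if (skcms_AdaptToXYZD50(0.3127f, 0.3290f, &d65_to_d50)) {
+//         // d65_to_d50 now chromatically adapts XYZ values from D65 to D50.
+//     }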
+
+bool skcms_PrimariesToXYZD50(float rx, float ry,
+ float gx, float gy,
+ float bx, float by,
+ float wx, float wy,
+ skcms_Matrix3x3* toXYZD50) {
+ if (!is_zero_to_one(rx) || !is_zero_to_one(ry) ||
+ !is_zero_to_one(gx) || !is_zero_to_one(gy) ||
+ !is_zero_to_one(bx) || !is_zero_to_one(by) ||
+ !is_zero_to_one(wx) || !is_zero_to_one(wy) ||
+ !toXYZD50) {
+ return false;
+ }
+
+ // First, we need to convert xy values (primaries) to XYZ.
+ skcms_Matrix3x3 primaries = {{
+ { rx, gx, bx },
+ { ry, gy, by },
+ { 1 - rx - ry, 1 - gx - gy, 1 - bx - by },
+ }};
+ skcms_Matrix3x3 primaries_inv;
+ if (!skcms_Matrix3x3_invert(&primaries, &primaries_inv)) {
+ return false;
+ }
+
+ // Assumes that Y is 1.0f.
+ skcms_Vector3 wXYZ = { { wx / wy, 1, (1 - wx - wy) / wy } };
+ skcms_Vector3 XYZ = mv_mul(&primaries_inv, &wXYZ);
+
+ skcms_Matrix3x3 toXYZ = {{
+ { XYZ.vals[0], 0, 0 },
+ { 0, XYZ.vals[1], 0 },
+ { 0, 0, XYZ.vals[2] },
+ }};
+ toXYZ = skcms_Matrix3x3_concat(&primaries, &toXYZ);
+
+ skcms_Matrix3x3 DXtoD50;
+ if (!skcms_AdaptToXYZD50(wx, wy, &DXtoD50)) {
+ return false;
+ }
+
+ *toXYZD50 = skcms_Matrix3x3_concat(&DXtoD50, &toXYZ);
+ return true;
+}
+
+
+bool skcms_Matrix3x3_invert(const skcms_Matrix3x3* src, skcms_Matrix3x3* dst) {
+ double a00 = src->vals[0][0],
+ a01 = src->vals[1][0],
+ a02 = src->vals[2][0],
+ a10 = src->vals[0][1],
+ a11 = src->vals[1][1],
+ a12 = src->vals[2][1],
+ a20 = src->vals[0][2],
+ a21 = src->vals[1][2],
+ a22 = src->vals[2][2];
+
+ double b0 = a00*a11 - a01*a10,
+ b1 = a00*a12 - a02*a10,
+ b2 = a01*a12 - a02*a11,
+ b3 = a20,
+ b4 = a21,
+ b5 = a22;
+
+ double determinant = b0*b5
+ - b1*b4
+ + b2*b3;
+
+ if (determinant == 0) {
+ return false;
+ }
+
+ double invdet = 1.0 / determinant;
+ if (invdet > +FLT_MAX || invdet < -FLT_MAX || !isfinitef_((float)invdet)) {
+ return false;
+ }
+
+ b0 *= invdet;
+ b1 *= invdet;
+ b2 *= invdet;
+ b3 *= invdet;
+ b4 *= invdet;
+ b5 *= invdet;
+
+ dst->vals[0][0] = (float)( a11*b5 - a12*b4 );
+ dst->vals[1][0] = (float)( a02*b4 - a01*b5 );
+ dst->vals[2][0] = (float)( + b2 );
+ dst->vals[0][1] = (float)( a12*b3 - a10*b5 );
+ dst->vals[1][1] = (float)( a00*b5 - a02*b3 );
+ dst->vals[2][1] = (float)( - b1 );
+ dst->vals[0][2] = (float)( a10*b4 - a11*b3 );
+ dst->vals[1][2] = (float)( a01*b3 - a00*b4 );
+ dst->vals[2][2] = (float)( + b0 );
+
+ for (int r = 0; r < 3; ++r)
+ for (int c = 0; c < 3; ++c) {
+ if (!isfinitef_(dst->vals[r][c])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+skcms_Matrix3x3 skcms_Matrix3x3_concat(const skcms_Matrix3x3* A, const skcms_Matrix3x3* B) {
+ skcms_Matrix3x3 m = { { { 0,0,0 },{ 0,0,0 },{ 0,0,0 } } };
+ for (int r = 0; r < 3; r++)
+ for (int c = 0; c < 3; c++) {
+ m.vals[r][c] = A->vals[r][0] * B->vals[0][c]
+ + A->vals[r][1] * B->vals[1][c]
+ + A->vals[r][2] * B->vals[2][c];
+ }
+ return m;
+}
+
+#if defined(__clang__)
+ [[clang::no_sanitize("float-divide-by-zero")]] // Checked for by classify() on the way out.
+#endif
+bool skcms_TransferFunction_invert(const skcms_TransferFunction* src, skcms_TransferFunction* dst) {
+ TF_PQish pq;
+ TF_HLGish hlg;
+ switch (classify(*src, &pq, &hlg)) {
+ case skcms_TFType_Invalid: return false;
+ case skcms_TFType_sRGBish: break; // handled below
+
+ case skcms_TFType_PQish:
+ *dst = { TFKind_marker(skcms_TFType_PQish), -pq.A, pq.D, 1.0f/pq.F
+ , pq.B, -pq.E, 1.0f/pq.C};
+ return true;
+
+ case skcms_TFType_HLGish:
+ *dst = { TFKind_marker(skcms_TFType_HLGinvish), 1.0f/hlg.R, 1.0f/hlg.G
+ , 1.0f/hlg.a, hlg.b, hlg.c
+ , hlg.K_minus_1 };
+ return true;
+
+ case skcms_TFType_HLGinvish:
+ *dst = { TFKind_marker(skcms_TFType_HLGish), 1.0f/hlg.R, 1.0f/hlg.G
+ , 1.0f/hlg.a, hlg.b, hlg.c
+ , hlg.K_minus_1 };
+ return true;
+ }
+
+ assert (classify(*src) == skcms_TFType_sRGBish);
+
+ // We're inverting this function, solving for x in terms of y.
+ // y = (cx + f) x < d
+ // (ax + b)^g + e x ≥ d
+ // The inverse of this function can be expressed in the same piecewise form.
+ skcms_TransferFunction inv = {0,0,0,0,0,0,0};
+
+ // We'll start by finding the new threshold inv.d.
+ // In principle we should be able to find that by solving for y at x=d from either side.
+ // (If those two d values aren't the same, it's a discontinuous transfer function.)
+ float d_l = src->c * src->d + src->f,
+ d_r = powf_(src->a * src->d + src->b, src->g) + src->e;
+ if (fabsf_(d_l - d_r) > 1/512.0f) {
+ return false;
+ }
+ inv.d = d_l; // TODO(mtklein): better in practice to choose d_r?
+
+ // When d=0, the linear section collapses to a point. We leave c,d,f all zero in that case.
+ if (inv.d > 0) {
+ // Inverting the linear section is pretty straightforward:
+ // y = cx + f
+ // y - f = cx
+ // (1/c)y - f/c = x
+ inv.c = 1.0f/src->c;
+ inv.f = -src->f/src->c;
+ }
+
+ // The interesting part is inverting the nonlinear section:
+ // y = (ax + b)^g + e.
+ // y - e = (ax + b)^g
+ // (y - e)^1/g = ax + b
+ // (y - e)^1/g - b = ax
+ // (1/a)(y - e)^1/g - b/a = x
+ //
+ // To make that fit our form, we need to move the (1/a) term inside the exponentiation:
+ // let k = (1/a)^g
+ // (1/a)( y - e)^1/g - b/a = x
+ // (ky - ke)^1/g - b/a = x
+
+ float k = powf_(src->a, -src->g); // (1/a)^g == a^-g
+ inv.g = 1.0f / src->g;
+ inv.a = k;
+ inv.b = -k * src->e;
+ inv.e = -src->b / src->a;
+
+ // We need to enforce the same constraints here that we do when fitting a curve,
+ // a >= 0 and ad+b >= 0. These constraints are checked by classify(), so they're true
+ // of the source function if we're here.
+
+ // Just like when fitting the curve, there's really no way to rescue a < 0.
+ if (inv.a < 0) {
+ return false;
+ }
+ // On the other hand we can rescue an ad+b that's gone slightly negative here.
+ if (inv.a * inv.d + inv.b < 0) {
+ inv.b = -inv.a * inv.d;
+ }
+
+ // That should usually make classify(inv) == sRGBish true, but there are a couple situations
+ // where we might still fail here, like non-finite parameter values.
+ if (classify(inv) != skcms_TFType_sRGBish) {
+ return false;
+ }
+
+ assert (inv.a >= 0);
+ assert (inv.a * inv.d + inv.b >= 0);
+
+ // Now in principle we're done.
+ // But to preserve the valuable invariant inv(src(1.0f)) == 1.0f, we'll tweak
+ // e or f of the inverse, depending on which segment contains src(1.0f).
+ float s = skcms_TransferFunction_eval(src, 1.0f);
+ if (!isfinitef_(s)) {
+ return false;
+ }
+
+ float sign = s < 0 ? -1.0f : 1.0f;
+ s *= sign;
+ if (s < inv.d) {
+ inv.f = 1.0f - sign * inv.c * s;
+ } else {
+ inv.e = 1.0f - sign * powf_(inv.a * s + inv.b, inv.g);
+ }
+
+ *dst = inv;
+ return classify(*dst) == skcms_TFType_sRGBish;
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
+
+// From here below we're approximating an skcms_Curve with an skcms_TransferFunction{g,a,b,c,d,e,f}:
+//
+// tf(x) = cx + f x < d
+// tf(x) = (ax + b)^g + e x ≥ d
+//
+// When fitting, we add the additional constraint that both pieces meet at d:
+//
+// cd + f = (ad + b)^g + e
+//
+// Solving for e and folding it through gives an alternate formulation of the non-linear piece:
+//
+// tf(x) = cx + f x < d
+// tf(x) = (ax + b)^g - (ad + b)^g + cd + f x ≥ d
+//
+// Our overall strategy is then:
+// For a couple tolerances,
+// - fit_linear(): fit c,d,f iteratively to as many points as our tolerance allows
+// - invert c,d,f
+// - fit_nonlinear(): fit g,a,b using Gauss-Newton given those inverted c,d,f
+// (and by constraint, inverted e) to the inverse of the table.
+// Return the parameters with least maximum error.
+//
+// To run Gauss-Newton to find g,a,b, we'll also need the gradient of the residuals
+// of round-trip f_inv(x), the inverse of the non-linear piece of f(x).
+//
+// let y = Table(x)
+// r(x) = x - f_inv(y)
+//
+// ∂r/∂g = ln(ay + b)*(ay + b)^g
+// - ln(ad + b)*(ad + b)^g
+// ∂r/∂a = yg(ay + b)^(g-1)
+// - dg(ad + b)^(g-1)
+// ∂r/∂b = g(ay + b)^(g-1)
+// - g(ad + b)^(g-1)
+
+// Return the residual of roundtripping skcms_Curve(x) through f_inv(y) with parameters P,
+// and fill out the gradient of the residual into dfdP.
+static float rg_nonlinear(float x,
+ const skcms_Curve* curve,
+ const skcms_TransferFunction* tf,
+ float dfdP[3]) {
+ const float y = eval_curve(curve, x);
+
+ const float g = tf->g, a = tf->a, b = tf->b,
+ c = tf->c, d = tf->d, f = tf->f;
+
+ const float Y = fmaxf_(a*y + b, 0.0f),
+ D = a*d + b;
+ assert (D >= 0);
+
+ // The gradient.
+ dfdP[0] = logf_(Y)*powf_(Y, g)
+ - logf_(D)*powf_(D, g);
+ dfdP[1] = y*g*powf_(Y, g-1)
+ - d*g*powf_(D, g-1);
+ dfdP[2] = g*powf_(Y, g-1)
+ - g*powf_(D, g-1);
+
+ // The residual.
+ const float f_inv = powf_(Y, g)
+ - powf_(D, g)
+ + c*d + f;
+ return x - f_inv;
+}
+
+static bool gauss_newton_step(const skcms_Curve* curve,
+ skcms_TransferFunction* tf,
+ float x0, float dx, int N) {
+ // We'll sample x from the range [x0,x1] (both inclusive) N times with even spacing.
+ //
+ // Let P = [ tf->g, tf->a, tf->b ] (the three terms that we're adjusting).
+ //
+ // We want to do P' = P + (Jf^T Jf)^-1 Jf^T r(P),
+ // where r(P) is the residual vector
+ // and Jf is the Jacobian matrix of f(), ∂r/∂P.
+ //
+ // Let's review the shape of each of these expressions:
+ // r(P) is [N x 1], a column vector with one entry per value of x tested
+ // Jf is [N x 3], a matrix with an entry for each (x,P) pair
+ // Jf^T is [3 x N], the transpose of Jf
+ //
+ // Jf^T Jf is [3 x N] * [N x 3] == [3 x 3], a 3x3 matrix,
+ // and so is its inverse (Jf^T Jf)^-1
+ // Jf^T r(P) is [3 x N] * [N x 1] == [3 x 1], a column vector with the same shape as P
+ //
+ // Our implementation strategy to get to the final ∆P is
+ // 1) evaluate Jf^T Jf, call that lhs
+ // 2) evaluate Jf^T r(P), call that rhs
+ // 3) invert lhs
+ // 4) multiply inverse lhs by rhs
+ //
+ // This is a friendly implementation strategy because we don't need any buffers that
+ // scale with N, and, equally nice, we don't have to perform any matrix operations
+ // whose size varies.
+ //
+ // Other implementation strategies could trade this off, e.g. evaluating the
+ // pseudoinverse of Jf ( (Jf^T Jf)^-1 Jf^T ) directly, then multiplying that by
+ // the residuals. That would probably require implementing singular value
+ // decomposition, and would create a [3 x N] matrix to be multiplied by the
+ // [N x 1] residual vector, but on the upside I think that'd eliminate the
+ // possibility of this gauss_newton_step() function ever failing.
+
+ // 0) start off with lhs and rhs safely zeroed.
+ skcms_Matrix3x3 lhs = {{ {0,0,0}, {0,0,0}, {0,0,0} }};
+ skcms_Vector3 rhs = { {0,0,0} };
+
+ // 1,2) evaluate lhs and evaluate rhs
+ // We want to evaluate Jf only once, but both lhs and rhs involve Jf^T,
+ // so we'll have to update lhs and rhs at the same time.
+ for (int i = 0; i < N; i++) {
+ float x = x0 + static_cast<float>(i)*dx;
+
+ float dfdP[3] = {0,0,0};
+ float resid = rg_nonlinear(x,curve,tf, dfdP);
+
+ for (int r = 0; r < 3; r++) {
+ for (int c = 0; c < 3; c++) {
+ lhs.vals[r][c] += dfdP[r] * dfdP[c];
+ }
+ rhs.vals[r] += dfdP[r] * resid;
+ }
+ }
+
+ // If any of the 3 P parameters are unused, this matrix will be singular.
+ // Detect those cases and fix them up to identity instead, so we can invert.
+ for (int k = 0; k < 3; k++) {
+ if (lhs.vals[0][k]==0 && lhs.vals[1][k]==0 && lhs.vals[2][k]==0 &&
+ lhs.vals[k][0]==0 && lhs.vals[k][1]==0 && lhs.vals[k][2]==0) {
+ lhs.vals[k][k] = 1;
+ }
+ }
+
+ // 3) invert lhs
+ skcms_Matrix3x3 lhs_inv;
+ if (!skcms_Matrix3x3_invert(&lhs, &lhs_inv)) {
+ return false;
+ }
+
+ // 4) multiply inverse lhs by rhs
+ skcms_Vector3 dP = mv_mul(&lhs_inv, &rhs);
+ tf->g += dP.vals[0];
+ tf->a += dP.vals[1];
+ tf->b += dP.vals[2];
+ return isfinitef_(tf->g) && isfinitef_(tf->a) && isfinitef_(tf->b);
+}
+
+static float max_roundtrip_error_checked(const skcms_Curve* curve,
+ const skcms_TransferFunction* tf_inv) {
+ skcms_TransferFunction tf;
+ if (!skcms_TransferFunction_invert(tf_inv, &tf) || skcms_TFType_sRGBish != classify(tf)) {
+ return INFINITY_;
+ }
+
+ skcms_TransferFunction tf_inv_again;
+ if (!skcms_TransferFunction_invert(&tf, &tf_inv_again)) {
+ return INFINITY_;
+ }
+
+ return skcms_MaxRoundtripError(curve, &tf_inv_again);
+}
+
+// Fit the points in [L,N) to the non-linear piece of tf, or return false if we can't.
+static bool fit_nonlinear(const skcms_Curve* curve, int L, int N, skcms_TransferFunction* tf) {
+ // This enforces a few constraints that are not modeled in gauss_newton_step()'s optimization.
+ auto fixup_tf = [tf]() {
+ // a must be non-negative. That ensures the function is monotonically increasing.
+ // We don't really know how to fix up a if it goes negative.
+ if (tf->a < 0) {
+ return false;
+ }
+ // ad+b must be non-negative. That ensures we don't end up with complex numbers in powf.
+ // We're just barely comfortable tweaking b so that ad+b is zero in this case.
+ if (tf->a * tf->d + tf->b < 0) {
+ tf->b = -tf->a * tf->d;
+ }
+ assert (tf->a >= 0 &&
+ tf->a * tf->d + tf->b >= 0);
+
+ // cd+f must be ~= (ad+b)^g+e. That ensures the function is continuous. We keep e as a free
+ // parameter so we can guarantee this.
+ tf->e = tf->c*tf->d + tf->f
+ - powf_(tf->a*tf->d + tf->b, tf->g);
+
+ return true;
+ };
+
+ if (!fixup_tf()) {
+ return false;
+ }
+
+ // No matter where we start, dx should always represent N even steps from 0 to 1.
+ const float dx = 1.0f / static_cast<float>(N-1);
+
+ skcms_TransferFunction best_tf = *tf;
+ float best_max_error = INFINITY_;
+
+ // Need this or several curves get worse... *sigh*
+ float init_error = max_roundtrip_error_checked(curve, tf);
+ if (init_error < best_max_error) {
+ best_max_error = init_error;
+ best_tf = *tf;
+ }
+
+ // As far as we can tell, 1 Gauss-Newton step won't converge, and 3 steps is no better than 2.
+ for (int j = 0; j < 8; j++) {
+ if (!gauss_newton_step(curve, tf, static_cast<float>(L)*dx, dx, N-L) || !fixup_tf()) {
+ *tf = best_tf;
+ return isfinitef_(best_max_error);
+ }
+
+ float max_error = max_roundtrip_error_checked(curve, tf);
+ if (max_error < best_max_error) {
+ best_max_error = max_error;
+ best_tf = *tf;
+ }
+ }
+
+ *tf = best_tf;
+ return isfinitef_(best_max_error);
+}
+
+bool skcms_ApproximateCurve(const skcms_Curve* curve,
+ skcms_TransferFunction* approx,
+ float* max_error) {
+ if (!curve || !approx || !max_error) {
+ return false;
+ }
+
+ if (curve->table_entries == 0) {
+ // No point approximating an skcms_TransferFunction with an skcms_TransferFunction!
+ return false;
+ }
+
+ if (curve->table_entries == 1 || curve->table_entries > (uint32_t)INT_MAX) {
+ // We need at least two points, and must put some reasonable cap on the maximum number.
+ return false;
+ }
+
+ int N = (int)curve->table_entries;
+ const float dx = 1.0f / static_cast<float>(N - 1);
+
+ *max_error = INFINITY_;
+ const float kTolerances[] = { 1.5f / 65535.0f, 1.0f / 512.0f };
+ for (int t = 0; t < ARRAY_COUNT(kTolerances); t++) {
+ skcms_TransferFunction tf,
+ tf_inv;
+
+ // It's problematic to fit curves with non-zero f, so always force it to zero explicitly.
+ tf.f = 0.0f;
+ int L = fit_linear(curve, N, kTolerances[t], &tf.c, &tf.d);
+
+ if (L == N) {
+ // If the entire data set was linear, move the coefficients to the nonlinear portion
+ // with G == 1. This lets us use a canonical representation with d == 0.
+ tf.g = 1;
+ tf.a = tf.c;
+ tf.b = tf.f;
+ tf.c = tf.d = tf.e = tf.f = 0;
+ } else if (L == N - 1) {
+ // Degenerate case with only two points in the nonlinear segment. Solve directly.
+ tf.g = 1;
+ tf.a = (eval_curve(curve, static_cast<float>(N-1)*dx) -
+ eval_curve(curve, static_cast<float>(N-2)*dx))
+ / dx;
+ tf.b = eval_curve(curve, static_cast<float>(N-2)*dx)
+ - tf.a * static_cast<float>(N-2)*dx;
+ tf.e = 0;
+ } else {
+ // Start by guessing a gamma-only curve through the midpoint.
+ int mid = (L + N) / 2;
+ float mid_x = static_cast<float>(mid) / static_cast<float>(N - 1);
+ float mid_y = eval_curve(curve, mid_x);
+ tf.g = log2f_(mid_y) / log2f_(mid_x);
+ tf.a = 1;
+ tf.b = 0;
+ tf.e = tf.c*tf.d + tf.f
+ - powf_(tf.a*tf.d + tf.b, tf.g);
+
+
+ if (!skcms_TransferFunction_invert(&tf, &tf_inv) ||
+ !fit_nonlinear(curve, L,N, &tf_inv)) {
+ continue;
+ }
+
+ // We fit tf_inv, so calculate tf to keep in sync.
+ // fit_nonlinear() should guarantee invertibility.
+ if (!skcms_TransferFunction_invert(&tf_inv, &tf)) {
+ assert(false);
+ continue;
+ }
+ }
+
+ // We'd better have a sane, sRGB-ish TF by now.
+ // Other non-Bad TFs would be fine, but we know we've only ever tried to fit sRGBish;
+ // anything else is just some accident of math and the way we pun tf.g as a type flag.
+ // fit_nonlinear() should guarantee this, but the special cases may fail this test.
+ if (skcms_TFType_sRGBish != classify(tf)) {
+ continue;
+ }
+
+ // We find our error by roundtripping the table through tf_inv.
+ //
+ // (The most likely use case for this approximation is to be inverted and
+ // used as the transfer function for a destination color space.)
+ //
+ // We've kept tf and tf_inv in sync above, but we can't guarantee that tf is
+ // invertible, so re-verify that here (and use the new inverse for testing).
+ // fit_nonlinear() should guarantee this, but the special cases that don't use
+ // it may fail this test.
+ if (!skcms_TransferFunction_invert(&tf, &tf_inv)) {
+ continue;
+ }
+
+ float err = skcms_MaxRoundtripError(curve, &tf_inv);
+ if (*max_error > err) {
+ *max_error = err;
+ *approx = tf;
+ }
+ }
+ return isfinitef_(*max_error);
+}
+
+// ~~~~ Impl. of skcms_Transform() ~~~~
+
+typedef enum {
+ Op_load_a8,
+ Op_load_g8,
+ Op_load_8888_palette8,
+ Op_load_4444,
+ Op_load_565,
+ Op_load_888,
+ Op_load_8888,
+ Op_load_1010102,
+ Op_load_101010x_XR,
+ Op_load_161616LE,
+ Op_load_16161616LE,
+ Op_load_161616BE,
+ Op_load_16161616BE,
+ Op_load_hhh,
+ Op_load_hhhh,
+ Op_load_fff,
+ Op_load_ffff,
+
+ Op_swap_rb,
+ Op_clamp,
+ Op_invert,
+ Op_force_opaque,
+ Op_premul,
+ Op_unpremul,
+ Op_matrix_3x3,
+ Op_matrix_3x4,
+
+ Op_lab_to_xyz,
+ Op_xyz_to_lab,
+
+ Op_tf_r,
+ Op_tf_g,
+ Op_tf_b,
+ Op_tf_a,
+
+ Op_pq_r,
+ Op_pq_g,
+ Op_pq_b,
+ Op_pq_a,
+
+ Op_hlg_r,
+ Op_hlg_g,
+ Op_hlg_b,
+ Op_hlg_a,
+
+ Op_hlginv_r,
+ Op_hlginv_g,
+ Op_hlginv_b,
+ Op_hlginv_a,
+
+ Op_table_r,
+ Op_table_g,
+ Op_table_b,
+ Op_table_a,
+
+ Op_clut_A2B,
+ Op_clut_B2A,
+
+ Op_store_a8,
+ Op_store_g8,
+ Op_store_4444,
+ Op_store_565,
+ Op_store_888,
+ Op_store_8888,
+ Op_store_1010102,
+ Op_store_161616LE,
+ Op_store_16161616LE,
+ Op_store_161616BE,
+ Op_store_16161616BE,
+ Op_store_101010x_XR,
+ Op_store_hhh,
+ Op_store_hhhh,
+ Op_store_fff,
+ Op_store_ffff,
+} Op;
+
+#if defined(__clang__)
+ template <int N, typename T> using Vec = T __attribute__((ext_vector_type(N)));
+#elif defined(__GNUC__)
+ // For some reason GCC accepts this nonsense, but not the more straightforward version,
+ // template <int N, typename T> using Vec = T __attribute__((vector_size(N*sizeof(T))));
+ template <int N, typename T>
+ struct VecHelper { typedef T __attribute__((vector_size(N*sizeof(T)))) V; };
+
+ template <int N, typename T> using Vec = typename VecHelper<N,T>::V;
+#endif
+
+// First, instantiate our default exec_ops() implementation using the default compilation target.
+
+namespace baseline {
+#if defined(SKCMS_PORTABLE) || !(defined(__clang__) || defined(__GNUC__)) \
+ || (defined(__EMSCRIPTEN_major__) && !defined(__wasm_simd128__))
+ #define N 1
+ template <typename T> using V = T;
+ using Color = float;
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+ #define N 16
+ template <typename T> using V = Vec<N,T>;
+ using Color = float;
+#elif defined(__AVX__)
+ #define N 8
+ template <typename T> using V = Vec<N,T>;
+ using Color = float;
+#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
+ #define N 8
+ template <typename T> using V = Vec<N,T>;
+ using Color = _Float16;
+#else
+ #define N 4
+ template <typename T> using V = Vec<N,T>;
+ using Color = float;
+#endif
+
+ #include "src/Transform_inl.h"
+ #undef N
+}
+
+// Now, instantiate any other versions of run_program() we may want for runtime detection.
+#if !defined(SKCMS_PORTABLE) && \
+ !defined(SKCMS_NO_RUNTIME_CPU_DETECTION) && \
+ (( defined(__clang__) && __clang_major__ >= 5) || \
+ (!defined(__clang__) && defined(__GNUC__))) \
+ && defined(__x86_64__)
+
+ #if !defined(__AVX2__)
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC target("avx2,f16c")
+ #endif
+
+ namespace hsw {
+ #define USING_AVX
+ #define USING_AVX_F16C
+ #define USING_AVX2
+ #define N 8
+ template <typename T> using V = Vec<N,T>;
+ using Color = float;
+
+ #include "src/Transform_inl.h"
+
+ // src/Transform_inl.h will undefine USING_* for us.
+ #undef N
+ }
+
+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
+ #define TEST_FOR_HSW
+ #endif
+
+ #if !defined(__AVX512F__) || !defined(__AVX512DQ__)
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl"))), apply_to=function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC target("avx512f,avx512dq,avx512cd,avx512bw,avx512vl")
+ #endif
+
+ namespace skx {
+ #define USING_AVX512F
+ #define N 16
+ template <typename T> using V = Vec<N,T>;
+ using Color = float;
+
+ #include "src/Transform_inl.h"
+
+ // src/Transform_inl.h will undefine USING_* for us.
+ #undef N
+ }
+
+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
+ #define TEST_FOR_SKX
+ #endif
+
+ #if defined(TEST_FOR_HSW) || defined(TEST_FOR_SKX)
+ enum class CpuType { None, HSW, SKX };
+ static CpuType cpu_type() {
+ static const CpuType type = []{
+ if (!runtime_cpu_detection) {
+ return CpuType::None;
+ }
+ // See http://www.sandpile.org/x86/cpuid.htm
+
+ // First, a basic cpuid(1) lets us check prerequisites for HSW, SKX.
+ uint32_t eax, ebx, ecx, edx;
+ __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+ : "0"(1), "2"(0));
+ if ((edx & (1u<<25)) && // SSE
+ (edx & (1u<<26)) && // SSE2
+ (ecx & (1u<< 0)) && // SSE3
+ (ecx & (1u<< 9)) && // SSSE3
+ (ecx & (1u<<12)) && // FMA (N.B. not used, avoided even)
+ (ecx & (1u<<19)) && // SSE4.1
+ (ecx & (1u<<20)) && // SSE4.2
+ (ecx & (1u<<26)) && // XSAVE
+ (ecx & (1u<<27)) && // OSXSAVE
+ (ecx & (1u<<28)) && // AVX
+ (ecx & (1u<<29))) { // F16C
+
+ // Call cpuid(7) to check for AVX2 and AVX-512 bits.
+ __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+ : "0"(7), "2"(0));
+ // eax from xgetbv(0) will tell us whether XMM, YMM, and ZMM state is saved.
+ uint32_t xcr0, dont_need_edx;
+ __asm__ __volatile__("xgetbv" : "=a"(xcr0), "=d"(dont_need_edx) : "c"(0));
+
+ if ((xcr0 & (1u<<1)) && // XMM register state saved?
+ (xcr0 & (1u<<2)) && // YMM register state saved?
+ (ebx & (1u<<5))) { // AVX2
+ // At this point we're at least HSW. Continue checking for SKX.
+ if ((xcr0 & (1u<< 5)) && // Opmasks state saved?
+ (xcr0 & (1u<< 6)) && // First 16 ZMM registers saved?
+ (xcr0 & (1u<< 7)) && // High 16 ZMM registers saved?
+ (ebx & (1u<<16)) && // AVX512F
+ (ebx & (1u<<17)) && // AVX512DQ
+ (ebx & (1u<<28)) && // AVX512CD
+ (ebx & (1u<<30)) && // AVX512BW
+ (ebx & (1u<<31))) { // AVX512VL
+ return CpuType::SKX;
+ }
+ return CpuType::HSW;
+ }
+ }
+ return CpuType::None;
+ }();
+ return type;
+ }
+ #endif
+
+#endif
+
+typedef struct {
+ Op op;
+ const void* arg;
+} OpAndArg;
+
+static OpAndArg select_curve_op(const skcms_Curve* curve, int channel) {
+ static const struct { Op sRGBish, PQish, HLGish, HLGinvish, table; } ops[] = {
+ { Op_tf_r, Op_pq_r, Op_hlg_r, Op_hlginv_r, Op_table_r },
+ { Op_tf_g, Op_pq_g, Op_hlg_g, Op_hlginv_g, Op_table_g },
+ { Op_tf_b, Op_pq_b, Op_hlg_b, Op_hlginv_b, Op_table_b },
+ { Op_tf_a, Op_pq_a, Op_hlg_a, Op_hlginv_a, Op_table_a },
+ };
+ const auto& op = ops[channel];
+
+ if (curve->table_entries == 0) {
+ const OpAndArg noop = { Op_load_a8/*doesn't matter*/, nullptr };
+
+ const skcms_TransferFunction& tf = curve->parametric;
+
+ if (tf.g == 1 && tf.a == 1 &&
+ tf.b == 0 && tf.c == 0 && tf.d == 0 && tf.e == 0 && tf.f == 0) {
+ return noop;
+ }
+
+ switch (classify(tf)) {
+ case skcms_TFType_Invalid: return noop;
+ case skcms_TFType_sRGBish: return OpAndArg{op.sRGBish, &tf};
+ case skcms_TFType_PQish: return OpAndArg{op.PQish, &tf};
+ case skcms_TFType_HLGish: return OpAndArg{op.HLGish, &tf};
+ case skcms_TFType_HLGinvish: return OpAndArg{op.HLGinvish, &tf};
+ }
+ }
+ return OpAndArg{op.table, curve};
+}
+
+static size_t bytes_per_pixel(skcms_PixelFormat fmt) {
+ switch (fmt >> 1) { // ignore rgb/bgr
+ case skcms_PixelFormat_A_8 >> 1: return 1;
+ case skcms_PixelFormat_G_8 >> 1: return 1;
+ case skcms_PixelFormat_RGBA_8888_Palette8 >> 1: return 1;
+ case skcms_PixelFormat_ABGR_4444 >> 1: return 2;
+ case skcms_PixelFormat_RGB_565 >> 1: return 2;
+ case skcms_PixelFormat_RGB_888 >> 1: return 3;
+ case skcms_PixelFormat_RGBA_8888 >> 1: return 4;
+ case skcms_PixelFormat_RGBA_8888_sRGB >> 1: return 4;
+ case skcms_PixelFormat_RGBA_1010102 >> 1: return 4;
+ case skcms_PixelFormat_RGB_101010x_XR >> 1: return 4;
+ case skcms_PixelFormat_RGB_161616LE >> 1: return 6;
+ case skcms_PixelFormat_RGBA_16161616LE >> 1: return 8;
+ case skcms_PixelFormat_RGB_161616BE >> 1: return 6;
+ case skcms_PixelFormat_RGBA_16161616BE >> 1: return 8;
+ case skcms_PixelFormat_RGB_hhh_Norm >> 1: return 6;
+ case skcms_PixelFormat_RGBA_hhhh_Norm >> 1: return 8;
+ case skcms_PixelFormat_RGB_hhh >> 1: return 6;
+ case skcms_PixelFormat_RGBA_hhhh >> 1: return 8;
+ case skcms_PixelFormat_RGB_fff >> 1: return 12;
+ case skcms_PixelFormat_RGBA_ffff >> 1: return 16;
+ }
+ assert(false);
+ return 0;
+}
+
+static bool prep_for_destination(const skcms_ICCProfile* profile,
+ skcms_Matrix3x3* fromXYZD50,
+ skcms_TransferFunction* invR,
+ skcms_TransferFunction* invG,
+ skcms_TransferFunction* invB) {
+ // skcms_Transform() supports B2A destinations...
+ if (profile->has_B2A) { return true; }
+ // ...and destinations with parametric transfer functions and an XYZD50 gamut matrix.
+ return profile->has_trc
+ && profile->has_toXYZD50
+ && profile->trc[0].table_entries == 0
+ && profile->trc[1].table_entries == 0
+ && profile->trc[2].table_entries == 0
+ && skcms_TransferFunction_invert(&profile->trc[0].parametric, invR)
+ && skcms_TransferFunction_invert(&profile->trc[1].parametric, invG)
+ && skcms_TransferFunction_invert(&profile->trc[2].parametric, invB)
+ && skcms_Matrix3x3_invert(&profile->toXYZD50, fromXYZD50);
+}
+
+bool skcms_Transform(const void* src,
+ skcms_PixelFormat srcFmt,
+ skcms_AlphaFormat srcAlpha,
+ const skcms_ICCProfile* srcProfile,
+ void* dst,
+ skcms_PixelFormat dstFmt,
+ skcms_AlphaFormat dstAlpha,
+ const skcms_ICCProfile* dstProfile,
+ size_t npixels) {
+ return skcms_TransformWithPalette(src, srcFmt, srcAlpha, srcProfile,
+ dst, dstFmt, dstAlpha, dstProfile,
+ npixels, nullptr);
+}
+
+bool skcms_TransformWithPalette(const void* src,
+ skcms_PixelFormat srcFmt,
+ skcms_AlphaFormat srcAlpha,
+ const skcms_ICCProfile* srcProfile,
+ void* dst,
+ skcms_PixelFormat dstFmt,
+ skcms_AlphaFormat dstAlpha,
+ const skcms_ICCProfile* dstProfile,
+ size_t nz,
+ const void* palette) {
+ const size_t dst_bpp = bytes_per_pixel(dstFmt),
+ src_bpp = bytes_per_pixel(srcFmt);
+ // Let's just refuse if the request is absurdly big.
+ if (nz * dst_bpp > INT_MAX || nz * src_bpp > INT_MAX) {
+ return false;
+ }
+ int n = (int)nz;
+
+ // Null profiles default to sRGB. Passing null for both is handy when doing format conversion.
+ if (!srcProfile) {
+ srcProfile = skcms_sRGB_profile();
+ }
+ if (!dstProfile) {
+ dstProfile = skcms_sRGB_profile();
+ }
+
+ // We can't transform in place unless the PixelFormats are the same size.
+ if (dst == src && dst_bpp != src_bpp) {
+ return false;
+ }
+ // TODO: more careful alias rejection (like, dst == src + 1)?
+
+ if (needs_palette(srcFmt) && !palette) {
+ return false;
+ }
+
+ Op program [32];
+ const void* arguments[32];
+
+ Op* ops = program;
+ const void** args = arguments;
+
+ // These are always parametric curves of some sort.
+ skcms_Curve dst_curves[3];
+ dst_curves[0].table_entries =
+ dst_curves[1].table_entries =
+ dst_curves[2].table_entries = 0;
+
+ skcms_Matrix3x3 from_xyz;
+
+ switch (srcFmt >> 1) {
+ default: return false;
+ case skcms_PixelFormat_A_8 >> 1: *ops++ = Op_load_a8; break;
+ case skcms_PixelFormat_G_8 >> 1: *ops++ = Op_load_g8; break;
+ case skcms_PixelFormat_ABGR_4444 >> 1: *ops++ = Op_load_4444; break;
+ case skcms_PixelFormat_RGB_565 >> 1: *ops++ = Op_load_565; break;
+ case skcms_PixelFormat_RGB_888 >> 1: *ops++ = Op_load_888; break;
+ case skcms_PixelFormat_RGBA_8888 >> 1: *ops++ = Op_load_8888; break;
+ case skcms_PixelFormat_RGBA_1010102 >> 1: *ops++ = Op_load_1010102; break;
+ case skcms_PixelFormat_RGB_101010x_XR >> 1: *ops++ = Op_load_101010x_XR; break;
+ case skcms_PixelFormat_RGB_161616LE >> 1: *ops++ = Op_load_161616LE; break;
+ case skcms_PixelFormat_RGBA_16161616LE >> 1: *ops++ = Op_load_16161616LE; break;
+ case skcms_PixelFormat_RGB_161616BE >> 1: *ops++ = Op_load_161616BE; break;
+ case skcms_PixelFormat_RGBA_16161616BE >> 1: *ops++ = Op_load_16161616BE; break;
+ case skcms_PixelFormat_RGB_hhh_Norm >> 1: *ops++ = Op_load_hhh; break;
+ case skcms_PixelFormat_RGBA_hhhh_Norm >> 1: *ops++ = Op_load_hhhh; break;
+ case skcms_PixelFormat_RGB_hhh >> 1: *ops++ = Op_load_hhh; break;
+ case skcms_PixelFormat_RGBA_hhhh >> 1: *ops++ = Op_load_hhhh; break;
+ case skcms_PixelFormat_RGB_fff >> 1: *ops++ = Op_load_fff; break;
+ case skcms_PixelFormat_RGBA_ffff >> 1: *ops++ = Op_load_ffff; break;
+
+ case skcms_PixelFormat_RGBA_8888_Palette8 >> 1: *ops++ = Op_load_8888_palette8;
+ *args++ = palette;
+ break;
+ case skcms_PixelFormat_RGBA_8888_sRGB >> 1:
+ *ops++ = Op_load_8888;
+ *ops++ = Op_tf_r; *args++ = skcms_sRGB_TransferFunction();
+ *ops++ = Op_tf_g; *args++ = skcms_sRGB_TransferFunction();
+ *ops++ = Op_tf_b; *args++ = skcms_sRGB_TransferFunction();
+ break;
+ }
+ if (srcFmt == skcms_PixelFormat_RGB_hhh_Norm ||
+ srcFmt == skcms_PixelFormat_RGBA_hhhh_Norm) {
+ *ops++ = Op_clamp;
+ }
+ if (srcFmt & 1) {
+ *ops++ = Op_swap_rb;
+ }
+ skcms_ICCProfile gray_dst_profile;
+ if ((dstFmt >> 1) == (skcms_PixelFormat_G_8 >> 1)) {
+ // When transforming to gray, stop at XYZ (by setting toXYZ to identity), then transform
+ // luminance (Y) by the destination transfer function.
+ gray_dst_profile = *dstProfile;
+ skcms_SetXYZD50(&gray_dst_profile, &skcms_XYZD50_profile()->toXYZD50);
+ dstProfile = &gray_dst_profile;
+ }
+
+ if (srcProfile->data_color_space == skcms_Signature_CMYK) {
+ // Photoshop creates CMYK images as inverse CMYK.
+ // These happen to be the only ones we've _ever_ seen.
+ *ops++ = Op_invert;
+ // With CMYK, ignore the alpha type, to avoid changing K or conflating CMY with K.
+ srcAlpha = skcms_AlphaFormat_Unpremul;
+ }
+
+ if (srcAlpha == skcms_AlphaFormat_Opaque) {
+ *ops++ = Op_force_opaque;
+ } else if (srcAlpha == skcms_AlphaFormat_PremulAsEncoded) {
+ *ops++ = Op_unpremul;
+ }
+
+ if (dstProfile != srcProfile) {
+
+ if (!prep_for_destination(dstProfile,
+ &from_xyz,
+ &dst_curves[0].parametric,
+ &dst_curves[1].parametric,
+ &dst_curves[2].parametric)) {
+ return false;
+ }
+
+ if (srcProfile->has_A2B) {
+ if (srcProfile->A2B.input_channels) {
+ for (int i = 0; i < (int)srcProfile->A2B.input_channels; i++) {
+ OpAndArg oa = select_curve_op(&srcProfile->A2B.input_curves[i], i);
+ if (oa.arg) {
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+ *ops++ = Op_clamp;
+ *ops++ = Op_clut_A2B;
+ *args++ = &srcProfile->A2B;
+ }
+
+ if (srcProfile->A2B.matrix_channels == 3) {
+ for (int i = 0; i < 3; i++) {
+ OpAndArg oa = select_curve_op(&srcProfile->A2B.matrix_curves[i], i);
+ if (oa.arg) {
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+
+ static const skcms_Matrix3x4 I = {{
+ {1,0,0,0},
+ {0,1,0,0},
+ {0,0,1,0},
+ }};
+ if (0 != memcmp(&I, &srcProfile->A2B.matrix, sizeof(I))) {
+ *ops++ = Op_matrix_3x4;
+ *args++ = &srcProfile->A2B.matrix;
+ }
+ }
+
+ if (srcProfile->A2B.output_channels == 3) {
+ for (int i = 0; i < 3; i++) {
+ OpAndArg oa = select_curve_op(&srcProfile->A2B.output_curves[i], i);
+ if (oa.arg) {
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+ }
+
+ if (srcProfile->pcs == skcms_Signature_Lab) {
+ *ops++ = Op_lab_to_xyz;
+ }
+
+ } else if (srcProfile->has_trc && srcProfile->has_toXYZD50) {
+ for (int i = 0; i < 3; i++) {
+ OpAndArg oa = select_curve_op(&srcProfile->trc[i], i);
+ if (oa.arg) {
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+ } else {
+ return false;
+ }
+
+ // A2B sources are in XYZD50 by now, but TRC sources are still in their original gamut.
+ assert (srcProfile->has_A2B || srcProfile->has_toXYZD50);
+
+ if (dstProfile->has_B2A) {
+ // B2A needs its input in XYZD50, so transform TRC sources now.
+ if (!srcProfile->has_A2B) {
+ *ops++ = Op_matrix_3x3;
+ *args++ = &srcProfile->toXYZD50;
+ }
+
+ if (dstProfile->pcs == skcms_Signature_Lab) {
+ *ops++ = Op_xyz_to_lab;
+ }
+
+ if (dstProfile->B2A.input_channels == 3) {
+ for (int i = 0; i < 3; i++) {
+ OpAndArg oa = select_curve_op(&dstProfile->B2A.input_curves[i], i);
+ if (oa.arg) {
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+ }
+
+ if (dstProfile->B2A.matrix_channels == 3) {
+ static const skcms_Matrix3x4 I = {{
+ {1,0,0,0},
+ {0,1,0,0},
+ {0,0,1,0},
+ }};
+ if (0 != memcmp(&I, &dstProfile->B2A.matrix, sizeof(I))) {
+ *ops++ = Op_matrix_3x4;
+ *args++ = &dstProfile->B2A.matrix;
+ }
+
+ for (int i = 0; i < 3; i++) {
+ OpAndArg oa = select_curve_op(&dstProfile->B2A.matrix_curves[i], i);
+ if (oa.arg) {
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+ }
+
+ if (dstProfile->B2A.output_channels) {
+ *ops++ = Op_clamp;
+ *ops++ = Op_clut_B2A;
+ *args++ = &dstProfile->B2A;
+ for (int i = 0; i < (int)dstProfile->B2A.output_channels; i++) {
+ OpAndArg oa = select_curve_op(&dstProfile->B2A.output_curves[i], i);
+ if (oa.arg) {
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+ }
+ } else {
+ // This is a TRC destination.
+ // We'll concat any src->xyz matrix with our xyz->dst matrix into one src->dst matrix.
+ // (A2B sources are already in XYZD50, making that src->xyz matrix I.)
+ static const skcms_Matrix3x3 I = {{
+ { 1.0f, 0.0f, 0.0f },
+ { 0.0f, 1.0f, 0.0f },
+ { 0.0f, 0.0f, 1.0f },
+ }};
+ const skcms_Matrix3x3* to_xyz = srcProfile->has_A2B ? &I : &srcProfile->toXYZD50;
+
+ // There's a chance the source and destination gamuts are identical,
+ // in which case we can skip the gamut transform.
+ if (0 != memcmp(&dstProfile->toXYZD50, to_xyz, sizeof(skcms_Matrix3x3))) {
+ // Concat the entire gamut transform into from_xyz,
+ // now slightly misnamed but it's a handy spot to stash the result.
+ from_xyz = skcms_Matrix3x3_concat(&from_xyz, to_xyz);
+ *ops++ = Op_matrix_3x3;
+ *args++ = &from_xyz;
+ }
+
+ // Encode back to dst RGB using its parametric transfer functions.
+ for (int i = 0; i < 3; i++) {
+ OpAndArg oa = select_curve_op(dst_curves+i, i);
+ if (oa.arg) {
+ assert (oa.op != Op_table_r &&
+ oa.op != Op_table_g &&
+ oa.op != Op_table_b &&
+ oa.op != Op_table_a);
+ *ops++ = oa.op;
+ *args++ = oa.arg;
+ }
+ }
+ }
+ }
+
+ // Clamp here before premul to make sure we're clamping to normalized values _and_ gamut,
+ // not just to values that fit in [0,1].
+ //
+ // E.g. r = 1.1, a = 0.5 would fit fine in fixed point after premul (ra=0.55,a=0.5),
+ // but would be carrying r > 1, which is really unexpected for downstream consumers.
+ if (dstFmt < skcms_PixelFormat_RGB_hhh) {
+ *ops++ = Op_clamp;
+ }
+
+ if (dstProfile->data_color_space == skcms_Signature_CMYK) {
+ // Photoshop creates CMYK images as inverse CMYK.
+ // These happen to be the only ones we've _ever_ seen.
+ *ops++ = Op_invert;
+
+ // CMYK has no alpha channel, so make sure dstAlpha is a no-op.
+ dstAlpha = skcms_AlphaFormat_Unpremul;
+ }
+
+ if (dstAlpha == skcms_AlphaFormat_Opaque) {
+ *ops++ = Op_force_opaque;
+ } else if (dstAlpha == skcms_AlphaFormat_PremulAsEncoded) {
+ *ops++ = Op_premul;
+ }
+ if (dstFmt & 1) {
+ *ops++ = Op_swap_rb;
+ }
+ switch (dstFmt >> 1) {
+ default: return false;
+ case skcms_PixelFormat_A_8 >> 1: *ops++ = Op_store_a8; break;
+ case skcms_PixelFormat_G_8 >> 1: *ops++ = Op_store_g8; break;
+ case skcms_PixelFormat_ABGR_4444 >> 1: *ops++ = Op_store_4444; break;
+ case skcms_PixelFormat_RGB_565 >> 1: *ops++ = Op_store_565; break;
+ case skcms_PixelFormat_RGB_888 >> 1: *ops++ = Op_store_888; break;
+ case skcms_PixelFormat_RGBA_8888 >> 1: *ops++ = Op_store_8888; break;
+ case skcms_PixelFormat_RGBA_1010102 >> 1: *ops++ = Op_store_1010102; break;
+ case skcms_PixelFormat_RGB_161616LE >> 1: *ops++ = Op_store_161616LE; break;
+ case skcms_PixelFormat_RGBA_16161616LE >> 1: *ops++ = Op_store_16161616LE; break;
+ case skcms_PixelFormat_RGB_161616BE >> 1: *ops++ = Op_store_161616BE; break;
+ case skcms_PixelFormat_RGBA_16161616BE >> 1: *ops++ = Op_store_16161616BE; break;
+ case skcms_PixelFormat_RGB_hhh_Norm >> 1: *ops++ = Op_store_hhh; break;
+ case skcms_PixelFormat_RGBA_hhhh_Norm >> 1: *ops++ = Op_store_hhhh; break;
+ case skcms_PixelFormat_RGB_101010x_XR >> 1: *ops++ = Op_store_101010x_XR; break;
+ case skcms_PixelFormat_RGB_hhh >> 1: *ops++ = Op_store_hhh; break;
+ case skcms_PixelFormat_RGBA_hhhh >> 1: *ops++ = Op_store_hhhh; break;
+ case skcms_PixelFormat_RGB_fff >> 1: *ops++ = Op_store_fff; break;
+ case skcms_PixelFormat_RGBA_ffff >> 1: *ops++ = Op_store_ffff; break;
+
+ case skcms_PixelFormat_RGBA_8888_sRGB >> 1:
+ *ops++ = Op_tf_r; *args++ = skcms_sRGB_Inverse_TransferFunction();
+ *ops++ = Op_tf_g; *args++ = skcms_sRGB_Inverse_TransferFunction();
+ *ops++ = Op_tf_b; *args++ = skcms_sRGB_Inverse_TransferFunction();
+ *ops++ = Op_store_8888;
+ break;
+ }
+
+ auto run = baseline::run_program;
+#if defined(TEST_FOR_HSW)
+ switch (cpu_type()) {
+ case CpuType::None: break;
+ case CpuType::HSW: run = hsw::run_program; break;
+ case CpuType::SKX: run = hsw::run_program; break;
+ }
+#endif
+#if defined(TEST_FOR_SKX)
+ switch (cpu_type()) {
+ case CpuType::None: break;
+ case CpuType::HSW: break;
+ case CpuType::SKX: run = skx::run_program; break;
+ }
+#endif
+ run(program, arguments, (const char*)src, (char*)dst, n, src_bpp,dst_bpp);
+ return true;
+}
+
+static void assert_usable_as_destination(const skcms_ICCProfile* profile) {
+#if defined(NDEBUG)
+ (void)profile;
+#else
+ skcms_Matrix3x3 fromXYZD50;
+ skcms_TransferFunction invR, invG, invB;
+ assert(prep_for_destination(profile, &fromXYZD50, &invR, &invG, &invB));
+#endif
+}
+
+bool skcms_MakeUsableAsDestination(skcms_ICCProfile* profile) {
+ if (!profile->has_B2A) {
+ skcms_Matrix3x3 fromXYZD50;
+ if (!profile->has_trc || !profile->has_toXYZD50
+ || !skcms_Matrix3x3_invert(&profile->toXYZD50, &fromXYZD50)) {
+ return false;
+ }
+
+ skcms_TransferFunction tf[3];
+ for (int i = 0; i < 3; i++) {
+ skcms_TransferFunction inv;
+ if (profile->trc[i].table_entries == 0
+ && skcms_TransferFunction_invert(&profile->trc[i].parametric, &inv)) {
+ tf[i] = profile->trc[i].parametric;
+ continue;
+ }
+
+ float max_error;
+ // Parametric curves from skcms_ApproximateCurve() are guaranteed to be invertible.
+ if (!skcms_ApproximateCurve(&profile->trc[i], &tf[i], &max_error)) {
+ return false;
+ }
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ profile->trc[i].table_entries = 0;
+ profile->trc[i].parametric = tf[i];
+ }
+ }
+ assert_usable_as_destination(profile);
+ return true;
+}
+
+bool skcms_MakeUsableAsDestinationWithSingleCurve(skcms_ICCProfile* profile) {
+ // Call skcms_MakeUsableAsDestination() with B2A disabled;
+ // on success that'll return a TRC/XYZ profile with three skcms_TransferFunctions.
+ skcms_ICCProfile result = *profile;
+ result.has_B2A = false;
+ if (!skcms_MakeUsableAsDestination(&result)) {
+ return false;
+ }
+
+ // Of the three, pick the transfer function that best fits the other two.
+ int best_tf = 0;
+ float min_max_error = INFINITY_;
+ for (int i = 0; i < 3; i++) {
+ skcms_TransferFunction inv;
+ if (!skcms_TransferFunction_invert(&result.trc[i].parametric, &inv)) {
+ return false;
+ }
+
+ float err = 0;
+ for (int j = 0; j < 3; ++j) {
+ err = fmaxf_(err, skcms_MaxRoundtripError(&profile->trc[j], &inv));
+ }
+ if (min_max_error > err) {
+ min_max_error = err;
+ best_tf = i;
+ }
+ }
+
+ for (int i = 0; i < 3; i++) {
+ result.trc[i].parametric = result.trc[best_tf].parametric;
+ }
+
+ *profile = result;
+ assert_usable_as_destination(profile);
+ return true;
+}
diff --git a/gfx/skia/skia/modules/skcms/skcms.gni b/gfx/skia/skia/modules/skcms/skcms.gni
new file mode 100644
index 0000000000..aa7daa2cf4
--- /dev/null
+++ b/gfx/skia/skia/modules/skcms/skcms.gni
@@ -0,0 +1,20 @@
+# DO NOT EDIT: This is a generated file.
+# See //bazel/exporter_tool/README.md for more information.
+#
+# The source of truth is //modules/skcms/BUILD.bazel
+
+# To update this file, run make -C bazel generate_gni
+
+_modules = get_path_info("../../modules", "abspath")
+
+# Generated by Bazel rule //modules/skcms:public_hdrs
+skcms_public_headers = [ "$_modules/skcms/skcms.h" ]
+
+# List generated by Bazel rules:
+# //modules/skcms:srcs
+# //modules/skcms:textual_hdrs
+skcms_sources = [
+ "$_modules/skcms/skcms.cc",
+ "$_modules/skcms/skcms_internal.h",
+ "$_modules/skcms/src/Transform_inl.h",
+]
diff --git a/gfx/skia/skia/modules/skcms/skcms.h b/gfx/skia/skia/modules/skcms/skcms.h
new file mode 100644
index 0000000000..322549b38f
--- /dev/null
+++ b/gfx/skia/skia/modules/skcms/skcms.h
@@ -0,0 +1,418 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#pragma once
+
+// skcms.h contains the entire public API for skcms.
+
+#ifndef SKCMS_API
+ #define SKCMS_API
+#endif
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// A row-major 3x3 matrix (ie vals[row][col])
+typedef struct skcms_Matrix3x3 {
+ float vals[3][3];
+} skcms_Matrix3x3;
+
+// It is _not_ safe to alias the pointers to invert in-place.
+SKCMS_API bool skcms_Matrix3x3_invert(const skcms_Matrix3x3*, skcms_Matrix3x3*);
+SKCMS_API skcms_Matrix3x3 skcms_Matrix3x3_concat(const skcms_Matrix3x3*, const skcms_Matrix3x3*);
+
+// A row-major 3x4 matrix (ie vals[row][col])
+typedef struct skcms_Matrix3x4 {
+ float vals[3][4];
+} skcms_Matrix3x4;
+
+// A transfer function mapping encoded values to linear values,
+// represented by this 7-parameter piecewise function:
+//
+// linear = sign(encoded) * (c*|encoded| + f) , 0 <= |encoded| < d
+// = sign(encoded) * ((a*|encoded| + b)^g + e), d <= |encoded|
+//
+// (A simple gamma transfer function sets g to gamma and a to 1.)
+typedef struct skcms_TransferFunction {
+ float g, a,b,c,d,e,f;
+} skcms_TransferFunction;
+
+SKCMS_API float skcms_TransferFunction_eval (const skcms_TransferFunction*, float);
+SKCMS_API bool skcms_TransferFunction_invert(const skcms_TransferFunction*,
+ skcms_TransferFunction*);
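+
+// As a concrete illustration (these are the commonly quoted sRGB constants, shown here only
+// as an example of the 7-parameter form, not as anything the library requires):
+//
+//    skcms_TransferFunction srgb_tf = {
+//        2.4f,            // g
+//        1/1.055f,        // a
+//        0.055f/1.055f,   // b
+//        1/12.92f,        // c
+//        0.04045f,        // d
+//        0.0f, 0.0f,      // e, f
+//    };
+//
+// skcms_TransferFunction_eval(&srgb_tf, 0.5f) then returns the linearized value (~0.214).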
+
+typedef enum skcms_TFType {
+ skcms_TFType_Invalid,
+ skcms_TFType_sRGBish,
+ skcms_TFType_PQish,
+ skcms_TFType_HLGish,
+ skcms_TFType_HLGinvish,
+} skcms_TFType;
+
+// Identify which kind of transfer function is encoded in an skcms_TransferFunction
+SKCMS_API skcms_TFType skcms_TransferFunction_getType(const skcms_TransferFunction*);
+
+// We can jam a couple alternate transfer function forms into skcms_TransferFunction,
+// including those matching the general forms of the SMPTE ST 2084 PQ function or HLG.
+//
+// PQish:
+// max(A + B|encoded|^C, 0)
+// linear = sign(encoded) * (------------------------) ^ F
+// D + E|encoded|^C
+SKCMS_API bool skcms_TransferFunction_makePQish(skcms_TransferFunction*,
+ float A, float B, float C,
+ float D, float E, float F);
+// HLGish:
+// { K * sign(encoded) * ( (R|encoded|)^G ) when 0 <= |encoded| <= 1/R
+// linear = { K * sign(encoded) * ( e^(a(|encoded|-c)) + b ) when 1/R < |encoded|
+SKCMS_API bool skcms_TransferFunction_makeScaledHLGish(skcms_TransferFunction*,
+ float K, float R, float G,
+ float a, float b, float c);
+
+// Compatibility shim with K=1 for old callers.
+static inline bool skcms_TransferFunction_makeHLGish(skcms_TransferFunction* fn,
+ float R, float G,
+ float a, float b, float c) {
+ return skcms_TransferFunction_makeScaledHLGish(fn, 1.0f, R,G, a,b,c);
+}
+
+// PQ mapping encoded [0,1] to linear [0,1].
+static inline bool skcms_TransferFunction_makePQ(skcms_TransferFunction* tf) {
+ return skcms_TransferFunction_makePQish(tf, -107/128.0f, 1.0f, 32/2523.0f
+ , 2413/128.0f, -2392/128.0f, 8192/1305.0f);
+}
+// HLG mapping encoded [0,1] to linear [0,12].
+static inline bool skcms_TransferFunction_makeHLG(skcms_TransferFunction* tf) {
+ return skcms_TransferFunction_makeHLGish(tf, 2.0f, 2.0f
+ , 1/0.17883277f, 0.28466892f, 0.55991073f);
+}
+
+// Is this an ordinary sRGB-ish transfer function, or one of the HDR forms we support?
+SKCMS_API bool skcms_TransferFunction_isSRGBish(const skcms_TransferFunction*);
+SKCMS_API bool skcms_TransferFunction_isPQish (const skcms_TransferFunction*);
+SKCMS_API bool skcms_TransferFunction_isHLGish (const skcms_TransferFunction*);
+
+// Unified representation of 'curv' or 'para' tag data, or a 1D table from 'mft1' or 'mft2'
+typedef union skcms_Curve {
+ struct {
+ uint32_t alias_of_table_entries;
+ skcms_TransferFunction parametric;
+ };
+ struct {
+ uint32_t table_entries;
+ const uint8_t* table_8;
+ const uint8_t* table_16;
+ };
+} skcms_Curve;
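+
+// A sketch of how the library itself distinguishes the two cases (see skcms_ApproximateCurve()
+// and the transform code): table_entries == 0 means the parametric form is in use.
+//
+//    if (curve->table_entries == 0) {
+//        // evaluate curve->parametric
+//    } else {
+//        // curve->table_8 or curve->table_16 holds table_entries samples
+//    }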
+
+// Complex transforms between device space (A) and profile connection space (B):
+// A2B: device -> [ "A" curves -> CLUT ] -> [ "M" curves -> matrix ] -> "B" curves -> PCS
+// B2A: device <- [ "A" curves <- CLUT ] <- [ "M" curves <- matrix ] <- "B" curves <- PCS
+
+typedef struct skcms_A2B {
+ // Optional: N 1D "A" curves, followed by an N-dimensional CLUT.
+ // If input_channels == 0, these curves and CLUT are skipped.
+ // Otherwise, input_channels must be in [1, 4].
+ uint32_t input_channels;
+ skcms_Curve input_curves[4];
+ uint8_t grid_points[4];
+ const uint8_t* grid_8;
+ const uint8_t* grid_16;
+
+ // Optional: 3 1D "M" curves, followed by a color matrix.
+ // If matrix_channels == 0, these curves and matrix are skipped.
+ // Otherwise, matrix_channels must be 3.
+ uint32_t matrix_channels;
+ skcms_Curve matrix_curves[3];
+ skcms_Matrix3x4 matrix;
+
+ // Required: 3 1D "B" curves. Always present, and output_channels must be 3.
+ uint32_t output_channels;
+ skcms_Curve output_curves[3];
+} skcms_A2B;
+
+typedef struct skcms_B2A {
+ // Required: 3 1D "B" curves. Always present, and input_channels must be 3.
+ uint32_t input_channels;
+ skcms_Curve input_curves[3];
+
+ // Optional: a color matrix, followed by 3 1D "M" curves.
+ // If matrix_channels == 0, this matrix and these curves are skipped.
+ // Otherwise, matrix_channels must be 3.
+ uint32_t matrix_channels;
+ skcms_Matrix3x4 matrix;
+ skcms_Curve matrix_curves[3];
+
+ // Optional: an N-dimensional CLUT, followed by N 1D "A" curves.
+ // If output_channels == 0, this CLUT and these curves are skipped.
+ // Otherwise, output_channels must be in [1, 4].
+ uint32_t output_channels;
+ uint8_t grid_points[4];
+ const uint8_t* grid_8;
+ const uint8_t* grid_16;
+ skcms_Curve output_curves[4];
+} skcms_B2A;
+
+typedef struct skcms_CICP {
+ uint8_t color_primaries;
+ uint8_t transfer_characteristics;
+ uint8_t matrix_coefficients;
+ uint8_t video_full_range_flag;
+} skcms_CICP;
+
+typedef struct skcms_ICCProfile {
+ const uint8_t* buffer;
+
+ uint32_t size;
+ uint32_t data_color_space;
+ uint32_t pcs;
+ uint32_t tag_count;
+
+ // skcms_Parse() will set commonly-used fields for you when possible:
+
+ // If we can parse red, green and blue transfer curves from the profile,
+ // trc will be set to those three curves, and has_trc will be true.
+ bool has_trc;
+ skcms_Curve trc[3];
+
+ // If this profile's gamut can be represented by a 3x3 transform to XYZD50,
+ // skcms_Parse() sets toXYZD50 to that transform and has_toXYZD50 to true.
+ bool has_toXYZD50;
+ skcms_Matrix3x3 toXYZD50;
+
+ // If the profile has a valid A2B0 or A2B1 tag, skcms_Parse() sets A2B to
+ // that data, and has_A2B to true. skcms_ParseWithA2BPriority() does the
+ // same following any user-provided prioritization of A2B0, A2B1, or A2B2.
+ bool has_A2B;
+ skcms_A2B A2B;
+
+ // If the profile has a valid B2A0 or B2A1 tag, skcms_Parse() sets B2A to
+ // that data, and has_B2A to true. skcms_ParseWithA2BPriority() does the
+ // same following any user-provided prioritization of B2A0, B2A1, or B2A2.
+ bool has_B2A;
+ skcms_B2A B2A;
+
+ // If the profile has a valid CICP tag, skcms_Parse() sets CICP to that data,
+ // and has_CICP to true.
+ bool has_CICP;
+ skcms_CICP CICP;
+} skcms_ICCProfile;
+
+// The sRGB color profile is so commonly used that we offer a canonical skcms_ICCProfile for it.
+SKCMS_API const skcms_ICCProfile* skcms_sRGB_profile(void);
+// Ditto for XYZD50, the most common profile connection space.
+SKCMS_API const skcms_ICCProfile* skcms_XYZD50_profile(void);
+
+SKCMS_API const skcms_TransferFunction* skcms_sRGB_TransferFunction(void);
+SKCMS_API const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction(void);
+SKCMS_API const skcms_TransferFunction* skcms_Identity_TransferFunction(void);
+
+// Practical equality test for two skcms_ICCProfiles.
+// The implementation is subject to change, but it will always try to answer
+// "can I substitute A for B?" and "can I skip transforming from A to B?".
+SKCMS_API bool skcms_ApproximatelyEqualProfiles(const skcms_ICCProfile* A,
+ const skcms_ICCProfile* B);
+
+// Practical test that answers: Is curve roughly the inverse of inv_tf? Typically used by passing
+// the inverse of a known parametric transfer function (like sRGB), to determine if a particular
+// curve is very close to sRGB.
+SKCMS_API bool skcms_AreApproximateInverses(const skcms_Curve* curve,
+ const skcms_TransferFunction* inv_tf);
+
+// Similar to above, answering the question for all three TRC curves of the given profile. Again,
+// passing skcms_sRGB_Inverse_TransferFunction as inv_tf will answer the question:
+// "Does this profile have a transfer function that is very close to sRGB?"
+SKCMS_API bool skcms_TRCs_AreApproximateInverse(const skcms_ICCProfile* profile,
+ const skcms_TransferFunction* inv_tf);
+
+// Parse an ICC profile and return true if possible, otherwise return false.
+// Selects an A2B profile (if present) according to priority list (each entry 0-2).
+// The buffer is not copied; it must remain valid as long as the skcms_ICCProfile will be used.
+SKCMS_API bool skcms_ParseWithA2BPriority(const void*, size_t,
+ const int priority[], int priorities,
+ skcms_ICCProfile*);
+
+static inline bool skcms_Parse(const void* buf, size_t len, skcms_ICCProfile* profile) {
+ // For continuity of existing user expectations,
+ // prefer A2B0 (perceptual) over A2B1 (relative colorimetric), and ignore A2B2 (saturation).
+ const int priority[] = {0,1};
+ return skcms_ParseWithA2BPriority(buf, len,
+ priority, sizeof(priority)/sizeof(*priority),
+ profile);
+}
+
+SKCMS_API bool skcms_ApproximateCurve(const skcms_Curve* curve,
+ skcms_TransferFunction* approx,
+ float* max_error);
+
+SKCMS_API bool skcms_GetCHAD(const skcms_ICCProfile*, skcms_Matrix3x3*);
+SKCMS_API bool skcms_GetWTPT(const skcms_ICCProfile*, float xyz[3]);
+
+// These are common ICC signature values
+enum {
+ // data_color_space
+ skcms_Signature_CMYK = 0x434D594B,
+ skcms_Signature_Gray = 0x47524159,
+ skcms_Signature_RGB = 0x52474220,
+
+ // pcs
+ skcms_Signature_Lab = 0x4C616220,
+ skcms_Signature_XYZ = 0x58595A20,
+};
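+// (Each signature is four ASCII characters packed big-endian,
+// e.g. 0x52474220 is 'R','G','B',' '.)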
+
+typedef enum skcms_PixelFormat {
+ skcms_PixelFormat_A_8,
+ skcms_PixelFormat_A_8_,
+ skcms_PixelFormat_G_8,
+ skcms_PixelFormat_G_8_,
+ skcms_PixelFormat_RGBA_8888_Palette8,
+ skcms_PixelFormat_BGRA_8888_Palette8,
+
+ skcms_PixelFormat_RGB_565,
+ skcms_PixelFormat_BGR_565,
+
+ skcms_PixelFormat_ABGR_4444,
+ skcms_PixelFormat_ARGB_4444,
+
+ skcms_PixelFormat_RGB_888,
+ skcms_PixelFormat_BGR_888,
+ skcms_PixelFormat_RGBA_8888,
+ skcms_PixelFormat_BGRA_8888,
+ skcms_PixelFormat_RGBA_8888_sRGB, // Automatic sRGB encoding / decoding.
+ skcms_PixelFormat_BGRA_8888_sRGB, // (Generally used with linear transfer functions.)
+
+ skcms_PixelFormat_RGBA_1010102,
+ skcms_PixelFormat_BGRA_1010102,
+
+ skcms_PixelFormat_RGB_161616LE, // Little-endian. Pointers must be 16-bit aligned.
+ skcms_PixelFormat_BGR_161616LE,
+ skcms_PixelFormat_RGBA_16161616LE,
+ skcms_PixelFormat_BGRA_16161616LE,
+
+ skcms_PixelFormat_RGB_161616BE, // Big-endian. Pointers must be 16-bit aligned.
+ skcms_PixelFormat_BGR_161616BE,
+ skcms_PixelFormat_RGBA_16161616BE,
+ skcms_PixelFormat_BGRA_16161616BE,
+
+ skcms_PixelFormat_RGB_hhh_Norm, // 1-5-10 half-precision float in [0,1]
+ skcms_PixelFormat_BGR_hhh_Norm, // Pointers must be 16-bit aligned.
+ skcms_PixelFormat_RGBA_hhhh_Norm,
+ skcms_PixelFormat_BGRA_hhhh_Norm,
+
+ skcms_PixelFormat_RGB_hhh, // 1-5-10 half-precision float.
+ skcms_PixelFormat_BGR_hhh, // Pointers must be 16-bit aligned.
+ skcms_PixelFormat_RGBA_hhhh,
+ skcms_PixelFormat_BGRA_hhhh,
+
+ skcms_PixelFormat_RGB_fff, // 1-8-23 single-precision float (the normal kind).
+ skcms_PixelFormat_BGR_fff, // Pointers must be 32-bit aligned.
+ skcms_PixelFormat_RGBA_ffff,
+ skcms_PixelFormat_BGRA_ffff,
+
+ skcms_PixelFormat_RGB_101010x_XR, // Note: This is located here to signal no clamping.
+ skcms_PixelFormat_BGR_101010x_XR, // Compatible with MTLPixelFormatBGR10_XR.
+} skcms_PixelFormat;
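+
+// Note that the formats above come in RGB/BGR pairs, in that order, so the low bit of a
+// skcms_PixelFormat selects the swapped channel order; given the enum order above,
+//
+//    skcms_PixelFormat_BGRA_8888 == skcms_PixelFormat_RGBA_8888 + 1
+//
+// which is why the transform implementation dispatches on (fmt >> 1) and tests (fmt & 1).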
+
+// We always store any alpha channel linearly. In the chart below, tf-1() is the inverse
+// transfer function for the given color profile (applying the transfer function linearizes).
+
+// We treat opaque as a strong requirement, not just a performance hint: we will ignore
+// any source alpha and treat it as 1.0, and will make sure that any destination alpha
+// channel is filled with the equivalent of 1.0.
+
+// We used to offer multiple types of premultiplication, but now just one, PremulAsEncoded.
+// This is the premul you're probably used to working with.
+
+typedef enum skcms_AlphaFormat {
+ skcms_AlphaFormat_Opaque, // alpha is always opaque
+ // tf-1(r), tf-1(g), tf-1(b), 1.0
+ skcms_AlphaFormat_Unpremul, // alpha and color are unassociated
+ // tf-1(r), tf-1(g), tf-1(b), a
+ skcms_AlphaFormat_PremulAsEncoded, // premultiplied while encoded
+ // tf-1(r)*a, tf-1(g)*a, tf-1(b)*a, a
+} skcms_AlphaFormat;
+
+// Convert npixels pixels from src format and color profile to dst format and color profile
+// and return true, otherwise return false. It is safe to alias dst == src if dstFmt == srcFmt.
+SKCMS_API bool skcms_Transform(const void* src,
+ skcms_PixelFormat srcFmt,
+ skcms_AlphaFormat srcAlpha,
+ const skcms_ICCProfile* srcProfile,
+ void* dst,
+ skcms_PixelFormat dstFmt,
+ skcms_AlphaFormat dstAlpha,
+ const skcms_ICCProfile* dstProfile,
+ size_t npixels);
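+
+// A minimal usage sketch (the buffer names and pixel count here are illustrative only):
+//
+//    skcms_ICCProfile src_profile;
+//    if (skcms_Parse(icc_bytes, icc_len, &src_profile)) {
+//        skcms_Transform(src_pixels, skcms_PixelFormat_RGBA_8888, skcms_AlphaFormat_Unpremul,
+//                        &src_profile,
+//                        dst_pixels, skcms_PixelFormat_RGBA_8888, skcms_AlphaFormat_Unpremul,
+//                        skcms_sRGB_profile(),
+//                        npixels);
+//    }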
+
+// As skcms_Transform(), supporting srcFmts with a palette.
+SKCMS_API bool skcms_TransformWithPalette(const void* src,
+ skcms_PixelFormat srcFmt,
+ skcms_AlphaFormat srcAlpha,
+ const skcms_ICCProfile* srcProfile,
+ void* dst,
+ skcms_PixelFormat dstFmt,
+ skcms_AlphaFormat dstAlpha,
+ const skcms_ICCProfile* dstProfile,
+ size_t npixels,
+ const void* palette);
+
+// If profile can be used as a destination in skcms_Transform, return true. Otherwise, attempt to
+// rewrite it with approximations where reasonable. If successful, return true. If no reasonable
+// approximation exists, leave the profile unchanged and return false.
+SKCMS_API bool skcms_MakeUsableAsDestination(skcms_ICCProfile* profile);
+
+// If profile can be used as a destination with a single parametric transfer function (ie for
+// rasterization), return true. Otherwise, attempt to rewrite it with approximations where
+// reasonable. If successful, return true. If no reasonable approximation exists, leave the
+// profile unchanged and return false.
+SKCMS_API bool skcms_MakeUsableAsDestinationWithSingleCurve(skcms_ICCProfile* profile);
+
+// Returns a matrix to adapt XYZ color from the given whitepoint to D50.
+SKCMS_API bool skcms_AdaptToXYZD50(float wx, float wy,
+ skcms_Matrix3x3* toXYZD50);
+
+// Returns a matrix to convert RGB color into XYZ adapted to D50, given the
+// primaries and whitepoint of the RGB model.
+SKCMS_API bool skcms_PrimariesToXYZD50(float rx, float ry,
+ float gx, float gy,
+ float bx, float by,
+ float wx, float wy,
+ skcms_Matrix3x3* toXYZD50);
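+
+// For example, the commonly quoted sRGB/Rec.709 primaries with a D65 white point
+// (values shown only as an illustration):
+//
+//    skcms_Matrix3x3 srgb_to_xyzd50;
+//    skcms_PrimariesToXYZD50(0.64f, 0.33f,     // red
+//                            0.30f, 0.60f,     // green
+//                            0.15f, 0.06f,     // blue
+//                            0.3127f, 0.3290f, // D65 white
+//                            &srgb_to_xyzd50);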
+
+// Call before your first call to skcms_Transform() to skip runtime CPU detection.
+SKCMS_API void skcms_DisableRuntimeCPUDetection(void);
+
+// Utilities for programmatically constructing profiles
+static inline void skcms_Init(skcms_ICCProfile* p) {
+ memset(p, 0, sizeof(*p));
+ p->data_color_space = skcms_Signature_RGB;
+ p->pcs = skcms_Signature_XYZ;
+}
+
+static inline void skcms_SetTransferFunction(skcms_ICCProfile* p,
+ const skcms_TransferFunction* tf) {
+ p->has_trc = true;
+ for (int i = 0; i < 3; ++i) {
+ p->trc[i].table_entries = 0;
+ p->trc[i].parametric = *tf;
+ }
+}
+
+static inline void skcms_SetXYZD50(skcms_ICCProfile* p, const skcms_Matrix3x3* m) {
+ p->has_toXYZD50 = true;
+ p->toXYZD50 = *m;
+}
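+
+// Putting these together, a simple RGB profile can be built without any ICC blob.
+// (A sketch; the transfer function and primaries are whatever the caller wants.)
+//
+//    skcms_ICCProfile p;
+//    skcms_Init(&p);
+//    skcms_SetTransferFunction(&p, skcms_sRGB_TransferFunction());
+//    skcms_Matrix3x3 m;
+//    if (skcms_PrimariesToXYZD50(0.64f,0.33f, 0.30f,0.60f, 0.15f,0.06f, 0.3127f,0.3290f, &m)) {
+//        skcms_SetXYZD50(&p, &m);
+//    }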
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/gfx/skia/skia/modules/skcms/skcms_internal.h b/gfx/skia/skia/modules/skcms/skcms_internal.h
new file mode 100644
index 0000000000..cc6d578ba0
--- /dev/null
+++ b/gfx/skia/skia/modules/skcms/skcms_internal.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#pragma once
+
+// skcms_internal.h contains APIs shared by skcms' internals and its test tools.
+// Please don't use this header from outside the skcms repo.
+
+#include "skcms.h"
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ~~~~ General Helper Macros ~~~~
+ #define ARRAY_COUNT(arr) (int)(sizeof((arr)) / sizeof(*(arr)))
+
+ typedef struct skcms_ICCTag {
+ uint32_t signature;
+ uint32_t type;
+ uint32_t size;
+ const uint8_t* buf;
+ } skcms_ICCTag;
+
+ void skcms_GetTagByIndex (const skcms_ICCProfile*, uint32_t idx, skcms_ICCTag*);
+ bool skcms_GetTagBySignature(const skcms_ICCProfile*, uint32_t sig, skcms_ICCTag*);
+
+ float skcms_MaxRoundtripError(const skcms_Curve* curve, const skcms_TransferFunction* inv_tf);
+
+ // 252 of a random shuffle of all possible bytes.
+ // 252 is evenly divisible by 3 and 4. Only 192, 10, 241, and 43 are missing.
+ // Used for ICC profile equivalence testing.
+ extern const uint8_t skcms_252_random_bytes[252];
+
+// ~~~~ Portable Math ~~~~
+ static inline float floorf_(float x) {
+ float roundtrip = (float)((int)x);
+ return roundtrip > x ? roundtrip - 1 : roundtrip;
+ }
+ static inline float fabsf_(float x) { return x < 0 ? -x : x; }
+ float powf_(float, float);
+
+// ~~~~ Does this pixel format need a palette pointer to be usable? ~~~~
+ static inline bool needs_palette(skcms_PixelFormat fmt) {
+ return (fmt >> 1) == (skcms_PixelFormat_RGBA_8888_Palette8 >> 1);
+ }
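+ // (Currently that matches skcms_PixelFormat_RGBA_8888_Palette8 and its BGRA
+ // counterpart, which are defined adjacently and so differ only in their low bit.)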
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/gfx/skia/skia/modules/skcms/src/Transform_inl.h b/gfx/skia/skia/modules/skcms/src/Transform_inl.h
new file mode 100644
index 0000000000..350f6a20a6
--- /dev/null
+++ b/gfx/skia/skia/modules/skcms/src/Transform_inl.h
@@ -0,0 +1,1628 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+// Intentionally NO #pragma once... included multiple times.
+
+// This file is included from skcms.cc in a namespace with some pre-defines:
+// - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
+// - V<T>: a template to create a vector of N T's.
+
+using F = V<Color>; // Called F for historic reasons... maybe rename C?
+using I32 = V<int32_t>;
+using U64 = V<uint64_t>;
+using U32 = V<uint32_t>;
+using U16 = V<uint16_t>;
+using U8 = V<uint8_t>;
+
+
+#if defined(__GNUC__) && !defined(__clang__)
+ // Once again, GCC is kind of weird, not allowing vector = scalar directly.
+ static constexpr F F0 = F() + 0.0f,
+ F1 = F() + 1.0f,
+ FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
+#else
+ static constexpr F F0 = 0.0f,
+ F1 = 1.0f,
+ FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
+#endif
+
+// Instead of checking __AVX__ below, we'll check USING_AVX.
+// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
+// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
+
+#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
+ #define USING_AVX
+#endif
+#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
+ #define USING_AVX_F16C
+#endif
+#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
+ #define USING_AVX2
+#endif
+#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
+ #define USING_AVX512F
+#endif
+
+// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
+// This is more for organizational clarity... skcms.cc doesn't force these.
+#if N > 1 && defined(__ARM_NEON)
+ #define USING_NEON
+ #if __ARM_FP & 2
+ #define USING_NEON_F16C
+ #endif
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
+ #define USING_NEON_FP16
+ #endif
+#endif
+
+// These -Wvector-conversion warnings seem to trigger in very bogus situations,
+// like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
+#if defined(USING_NEON) && defined(__clang__)
+ #pragma clang diagnostic ignored "-Wvector-conversion"
+#endif
+
+// GCC & Clang (but not clang-cl) warn that returning U64 on x86 is larger than a register.
+// You'd see warnings like "using AVX even though AVX is not enabled".
+// We stifle these warnings; our helpers that return U64 are always inlined.
+#if defined(__SSE__) && defined(__GNUC__)
+ #if !defined(__has_warning)
+ #pragma GCC diagnostic ignored "-Wpsabi"
+ #elif __has_warning("-Wpsabi")
+ #pragma GCC diagnostic ignored "-Wpsabi"
+ #endif
+#endif
+
+#if defined(__clang__)
+ #define FALLTHROUGH [[clang::fallthrough]]
+#else
+ #define FALLTHROUGH
+#endif
+
+// We tag most helper functions as SI, to enforce good code generation
+// but also work around what we think is a bug in GCC: when targeting 32-bit
+// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
+// MMX mm0 register, which seems to mess with unrelated code that later uses
+// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
+//
+// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
+#if defined(__clang__) || defined(__GNUC__)
+ #define SI static inline __attribute__((always_inline))
+#else
+ #define SI static inline
+#endif
+
+template <typename T, typename P>
+SI T load(const P* ptr) {
+ T val;
+ small_memcpy(&val, ptr, sizeof(val));
+ return val;
+}
+template <typename T, typename P>
+SI void store(P* ptr, const T& val) {
+ small_memcpy(ptr, &val, sizeof(val));
+}
+
+// (T)v is a cast when N == 1 and a bit-pun when N>1,
+// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
+template <typename D, typename S>
+SI D cast(const S& v) {
+#if N == 1
+ return (D)v;
+#elif defined(__clang__)
+ return __builtin_convertvector(v, D);
+#else
+ D d;
+ for (int i = 0; i < N; i++) {
+ d[i] = v[i];
+ }
+ return d;
+#endif
+}
+
+template <typename D, typename S>
+SI D bit_pun(const S& v) {
+ static_assert(sizeof(D) == sizeof(v), "");
+ return load<D>(&v);
+}
+
+// When we convert from float to fixed point, it's very common to want to round,
+// and for some reason compilers generate better code when converting to int32_t.
+// To serve both those ends, we use this function to_fixed() instead of direct cast().
+#if defined(USING_NEON_FP16)
+ // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
+ SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
+#else
+ SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
+#endif
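+
+// A quick worked example of the rounding: a lane holding 254.6f becomes 255.1f after
+// adding 0.5f, and the truncating cast then yields 255 -- i.e. round-to-nearest,
+// where a plain truncating cast would have produced 254.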
+
+
+// Sometimes we do something crazy on one branch of a conditional,
+// like divide by zero or convert a huge float to an integer,
+// but then harmlessly select the other side. That trips up N==1
+// sanitizer builds, so we make if_then_else() a macro to avoid
+// evaluating the unused side.
+
+#if N == 1
+ #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
+#else
+ template <typename C, typename T>
+ SI T if_then_else(C cond, T t, T e) {
+ return bit_pun<T>( ( cond & bit_pun<C>(t)) |
+ (~cond & bit_pun<C>(e)) );
+ }
+#endif
+
+
+SI F F_from_Half(U16 half) {
+#if defined(USING_NEON_FP16)
+ return bit_pun<F>(half);
+#elif defined(USING_NEON_F16C)
+ return vcvt_f32_f16((float16x4_t)half);
+#elif defined(USING_AVX512F)
+ return (F)_mm512_cvtph_ps((__m256i)half);
+#elif defined(USING_AVX_F16C)
+ typedef int16_t __attribute__((vector_size(16))) I16;
+ return __builtin_ia32_vcvtph2ps256((I16)half);
+#else
+ U32 wide = cast<U32>(half);
+ // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
+ U32 s = wide & 0x8000,
+ em = wide ^ s;
+
+ // Constructing the float is easy if the half is not denormalized.
+ F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
+
+ // Simply flush all denorm half floats to zero.
+ return if_then_else(em < 0x0400, F0, norm);
+#endif
+}
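+
+// A worked example of the portable path above: half 1.0 is 0x3C00, so s = 0 and
+// em = 0x3C00, and (s<<16) + (em<<13) + ((127-15)<<23) = 0x07800000 + 0x38000000
+// = 0x3F800000, exactly the bit pattern of 1.0f.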
+
+#if defined(__clang__)
+ // The -((127-15)<<10) underflows that side of the math when
+ // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+SI U16 Half_from_F(F f) {
+#if defined(USING_NEON_FP16)
+ return bit_pun<U16>(f);
+#elif defined(USING_NEON_F16C)
+ return (U16)vcvt_f16_f32(f);
+#elif defined(USING_AVX512F)
+ return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
+#elif defined(USING_AVX_F16C)
+ return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
+#else
+ // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
+ U32 sem = bit_pun<U32>(f),
+ s = sem & 0x80000000,
+ em = sem ^ s;
+
+ // For simplicity we flush denorm half floats (including all denorm floats) to zero.
+ return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
+ , (s>>16) + (em>>13) - ((127-15)<<10)));
+#endif
+}
+
+// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
+#if defined(USING_NEON_FP16)
+ SI U16 swap_endian_16(U16 v) {
+ return (U16)vrev16q_u8((uint8x16_t) v);
+ }
+#elif defined(USING_NEON)
+ SI U16 swap_endian_16(U16 v) {
+ return (U16)vrev16_u8((uint8x8_t) v);
+ }
+#endif
+
+SI U64 swap_endian_16x4(const U64& rgba) {
+ return (rgba & 0x00ff00ff00ff00ff) << 8
+ | (rgba & 0xff00ff00ff00ff00) >> 8;
+}
+
+#if defined(USING_NEON_FP16)
+ SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
+ SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
+#elif defined(USING_NEON)
+ SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
+ SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
+#else
+ SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
+ SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
+#endif
+
+SI F floor_(F x) {
+#if N == 1
+ return floorf_(x);
+#elif defined(USING_NEON_FP16)
+ return vrndmq_f16(x);
+#elif defined(__aarch64__)
+ return vrndmq_f32(x);
+#elif defined(USING_AVX512F)
+ // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
+ // and the integer sanitizer catches that this implicit cast changes the
+ // value from -1 to 65535. We'll cast manually to work around it.
+ // Read this as `return _mm512_floor_ps(x)`.
+ return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
+#elif defined(USING_AVX)
+ return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
+#elif defined(__SSE4_1__)
+ return _mm_floor_ps(x);
+#else
+ // Round trip through integers with a truncating cast.
+ F roundtrip = cast<F>(cast<I32>(x));
+ // If x is negative, truncating gives the ceiling instead of the floor.
+ return roundtrip - if_then_else(roundtrip > x, F1, F0);
+
+ // This implementation fails for values of x that are outside
+ // the range an integer can represent. We expect most x to be small.
+#endif
+}
+
+SI F approx_log2(F x) {
+#if defined(USING_NEON_FP16)
+ // TODO(mtklein)
+ return x;
+#else
+ // The first approximation of log2(x) is its exponent 'e', minus 127.
+ I32 bits = bit_pun<I32>(x);
+
+ F e = cast<F>(bits) * (1.0f / (1<<23));
+
+ // If we use the mantissa too we can refine the error significantly.
+ F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
+
+ return e - 124.225514990f
+ - 1.498030302f*m
+ - 1.725879990f/(0.3520887068f + m);
+#endif
+}
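+
+// Rough sanity check of the refinement above: for x = 1.0f, bits = 0x3f800000, so
+// e = 127 and m = 0.5, and 127 - 124.225514990 - 1.498030302*0.5
+// - 1.725879990/(0.3520887068 + 0.5) comes out to ~0, matching log2(1) = 0.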
+
+SI F approx_log(F x) {
+ const float ln2 = 0.69314718f;
+ return ln2 * approx_log2(x);
+}
+
+SI F approx_exp2(F x) {
+#if defined(USING_NEON_FP16)
+ // TODO(mtklein)
+ return x;
+#else
+ F fract = x - floor_(x);
+
+ F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
+ - 1.490129070f*fract
+ + 27.728023300f/(4.84252568f - fract));
+ I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
+
+ return bit_pun<F>(bits);
+#endif
+}
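+
+// Mirror-image check for approx_exp2() above: for x = 0.0f, fract = 0 and the
+// parenthesized sum is 121.274057500 + 27.728023300/4.84252568, which is about
+// 127.0, so fbits is roughly 127 * 2^23 -- the bit pattern of 1.0f, matching
+// exp2(0) = 1.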
+
+SI F approx_pow(F x, float y) {
+ return if_then_else((x == F0) | (x == F1), x
+ , approx_exp2(approx_log2(x) * y));
+}
+
+SI F approx_exp(F x) {
+ const float log2_e = 1.4426950408889634074f;
+ return approx_exp2(log2_e * x);
+}
+
+// Return tf(x).
+SI F apply_tf(const skcms_TransferFunction* tf, F x) {
+#if defined(USING_NEON_FP16)
+ // TODO(mtklein)
+ (void)tf;
+ return x;
+#else
+ // Peel off the sign bit and set x = |x|.
+ U32 bits = bit_pun<U32>(x),
+ sign = bits & 0x80000000;
+ x = bit_pun<F>(bits ^ sign);
+
+ // The transfer function has a linear part up to d, exponential at d and after.
+ F v = if_then_else(x < tf->d, tf->c*x + tf->f
+ , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
+
+ // Tack the sign bit back on.
+ return bit_pun<F>(sign | bit_pun<U32>(v));
+#endif
+}
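+
+// For orientation, apply_tf() with the usual sRGB parameters (g=2.4, a=1/1.055,
+// b=0.055/1.055, c=1/12.92, d=0.04045, e=f=0) evaluates the familiar sRGB curve,
+// e.g. apply_tf(&srgb, 0.5f) lands around 0.214.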
+
+SI F apply_pq(const skcms_TransferFunction* tf, F x) {
+#if defined(USING_NEON_FP16)
+ // TODO(mtklein)
+ (void)tf;
+ return x;
+#else
+ U32 bits = bit_pun<U32>(x),
+ sign = bits & 0x80000000;
+ x = bit_pun<F>(bits ^ sign);
+
+ F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
+ / (tf->d + tf->e * approx_pow(x, tf->c)),
+ tf->f);
+
+ return bit_pun<F>(sign | bit_pun<U32>(v));
+#endif
+}
+
+SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
+#if defined(USING_NEON_FP16)
+ // TODO(mtklein)
+ (void)tf;
+ return x;
+#else
+ const float R = tf->a, G = tf->b,
+ a = tf->c, b = tf->d, c = tf->e,
+ K = tf->f + 1;
+ U32 bits = bit_pun<U32>(x),
+ sign = bits & 0x80000000;
+ x = bit_pun<F>(bits ^ sign);
+
+ F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
+ , approx_exp((x-c)*a) + b);
+
+ return K*bit_pun<F>(sign | bit_pun<U32>(v));
+#endif
+}
+
+SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
+#if defined(USING_NEON_FP16)
+ // TODO(mtklein)
+ (void)tf;
+ return x;
+#else
+ const float R = tf->a, G = tf->b,
+ a = tf->c, b = tf->d, c = tf->e,
+ K = tf->f + 1;
+ U32 bits = bit_pun<U32>(x),
+ sign = bits & 0x80000000;
+ x = bit_pun<F>(bits ^ sign);
+ x /= K;
+
+ F v = if_then_else(x <= 1, R * approx_pow(x, G)
+ , a * approx_log(x - b) + c);
+
+ return bit_pun<F>(sign | bit_pun<U32>(v));
+#endif
+}
+
+
+// Strided loads and stores of N values, starting from p.
+template <typename T, typename P>
+SI T load_3(const P* p) {
+#if N == 1
+ return (T)p[0];
+#elif N == 4
+ return T{p[ 0],p[ 3],p[ 6],p[ 9]};
+#elif N == 8
+ return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
+#elif N == 16
+ return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
+ p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
+#endif
+}
+
+template <typename T, typename P>
+SI T load_4(const P* p) {
+#if N == 1
+ return (T)p[0];
+#elif N == 4
+ return T{p[ 0],p[ 4],p[ 8],p[12]};
+#elif N == 8
+ return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
+#elif N == 16
+ return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
+ p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
+#endif
+}
+
+template <typename T, typename P>
+SI void store_3(P* p, const T& v) {
+#if N == 1
+ p[0] = v;
+#elif N == 4
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
+#elif N == 8
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
+ p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
+#elif N == 16
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
+ p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
+ p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
+ p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
+#endif
+}
+
+template <typename T, typename P>
+SI void store_4(P* p, const T& v) {
+#if N == 1
+ p[0] = v;
+#elif N == 4
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
+#elif N == 8
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
+ p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
+#elif N == 16
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
+ p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
+ p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
+ p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
+#endif
+}
+
+
+SI U8 gather_8(const uint8_t* p, I32 ix) {
+#if N == 1
+ U8 v = p[ix];
+#elif N == 4
+ U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
+#elif N == 8
+ U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
+ p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
+#elif N == 16
+ U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
+ p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
+ p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
+ p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
+#endif
+ return v;
+}
+
+SI U16 gather_16(const uint8_t* p, I32 ix) {
+ // Load the i'th 16-bit value from p.
+ auto load_16 = [p](int i) {
+ return load<uint16_t>(p + 2*i);
+ };
+#if N == 1
+ U16 v = load_16(ix);
+#elif N == 4
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
+#elif N == 8
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
+ load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
+#elif N == 16
+ U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
+ load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
+ load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
+ load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
+#endif
+ return v;
+}
+
+SI U32 gather_32(const uint8_t* p, I32 ix) {
+ // Load the i'th 32-bit value from p.
+ auto load_32 = [p](int i) {
+ return load<uint32_t>(p + 4*i);
+ };
+#if N == 1
+ U32 v = load_32(ix);
+#elif N == 4
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
+#elif N == 8
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
+ load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
+#elif N == 16
+ U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
+ load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
+ load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
+ load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
+#endif
+ // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
+ return v;
+}
+
+SI U32 gather_24(const uint8_t* p, I32 ix) {
+ // First, back up a byte. Any place we're gathering from has a safe junk byte to read
+ // in front of it, either a previous table value, or some tag metadata.
+ p -= 1;
+
+ // Load the i'th 24-bit value from p, and 1 extra byte.
+ auto load_24_32 = [p](int i) {
+ return load<uint32_t>(p + 3*i);
+ };
+
+ // Now load multiples of 4 bytes (a junk byte, then r,g,b).
+#if N == 1
+ U32 v = load_24_32(ix);
+#elif N == 4
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
+#elif N == 8 && !defined(USING_AVX2)
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
+ load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
+#elif N == 8
+ (void)load_24_32;
+ // The gather instruction here doesn't need any particular alignment,
+ // but the intrinsic takes a const int*.
+ const int* p4 = bit_pun<const int*>(p);
+ I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
+ mask = {-1,-1,-1,-1, -1,-1,-1,-1};
+ #if defined(__clang__)
+ U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
+ #elif defined(__GNUC__)
+ U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
+ #endif
+#elif N == 16
+ (void)load_24_32;
+ // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
+ // And AVX-512 swapped the order of arguments. :/
+ const int* p4 = bit_pun<const int*>(p);
+ U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
+#endif
+
+ // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
+ return v >> 8;
+}
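+
+// In scalar terms, each lane of gather_24() reads the four bytes at p + 3*ix - 1 as
+// a little-endian uint32_t -- junk, r, g, b -- and the v >> 8 drops the junk byte,
+// leaving r | g<<8 | b<<16. That's why backing p up by one is safe whenever a table
+// value or tag metadata precedes the gather source.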
+
+#if !defined(__arm__)
+ SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
+ // As in gather_24(), with everything doubled.
+ p -= 2;
+
+ // Load the i'th 48-bit value from p, and 2 extra bytes.
+ auto load_48_64 = [p](int i) {
+ return load<uint64_t>(p + 6*i);
+ };
+
+ #if N == 1
+ *v = load_48_64(ix);
+ #elif N == 4
+ *v = U64{
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
+ };
+ #elif N == 8 && !defined(USING_AVX2)
+ *v = U64{
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
+ load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
+ };
+ #elif N == 8
+ (void)load_48_64;
+ typedef int32_t __attribute__((vector_size(16))) Half_I32;
+ typedef long long __attribute__((vector_size(32))) Half_I64;
+
+ // The gather instruction here doesn't need any particular alignment,
+ // but the intrinsic takes a const long long*.
+ const long long int* p8 = bit_pun<const long long int*>(p);
+
+ Half_I64 zero = { 0, 0, 0, 0},
+ mask = {-1,-1,-1,-1};
+
+ ix *= 6;
+ Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
+ ix_hi = { ix[4], ix[5], ix[6], ix[7] };
+
+ #if defined(__clang__)
+ Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
+ hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
+ #elif defined(__GNUC__)
+ Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
+ hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
+ #endif
+ store((char*)v + 0, lo);
+ store((char*)v + 32, hi);
+ #elif N == 16
+ (void)load_48_64;
+ const long long int* p8 = bit_pun<const long long int*>(p);
+ __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
+ hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
+ store((char*)v + 0, lo);
+ store((char*)v + 64, hi);
+ #endif
+
+ *v >>= 16;
+ }
+#endif
+
+SI F F_from_U8(U8 v) {
+ return cast<F>(v) * (1/255.0f);
+}
+
+SI F F_from_U16_BE(U16 v) {
+ // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
+ // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
+ U16 lo = (v >> 8),
+ hi = (v << 8) & 0xffff;
+ return cast<F>(lo|hi) * (1/65535.0f);
+}
+
+SI U16 U16_from_F(F v) {
+ // 65535 == inf in FP16, so promote to FP32 before converting.
+ return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
+}
+
+SI F minus_1_ulp(F v) {
+#if defined(USING_NEON_FP16)
+ return bit_pun<F>( bit_pun<U16>(v) - 1 );
+#else
+ return bit_pun<F>( bit_pun<U32>(v) - 1 );
+#endif
+}
+
+SI F table(const skcms_Curve* curve, F v) {
+ // Clamp the input to [0,1], then scale to a table index.
+ F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
+
+ // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
+ I32 lo = cast<I32>( ix ),
+ hi = cast<I32>(minus_1_ulp(ix+1.0f));
+ F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
+
+ // TODO: can we load l and h simultaneously? Each entry in 'h' is either
+ // the same as in 'l' or adjacent. We have a rough idea that it'd always be safe
+ // to read adjacent entries and perhaps underflow the table by a byte or two
+ // (it'd be junk, but always safe to read). Not sure how to lerp yet.
+ F l,h;
+ if (curve->table_8) {
+ l = F_from_U8(gather_8(curve->table_8, lo));
+ h = F_from_U8(gather_8(curve->table_8, hi));
+ } else {
+ l = F_from_U16_BE(gather_16(curve->table_16, lo));
+ h = F_from_U16_BE(gather_16(curve->table_16, hi));
+ }
+ return l + (h-l)*t;
+}
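+
+// A scalar picture of the table() lookup above, for a 256-entry 8-bit table and
+// v = 0.3f: ix = 0.3*255 = 76.5, so lo = 76, hi = 77, t = 0.5, and the result is the
+// midpoint of table_8[76]/255.0f and table_8[77]/255.0f. minus_1_ulp() keeps hi
+// pinned to table_entries-1 when ix lands exactly on the last entry.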
+
+SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
+ U32 rgb = gather_24(grid_8, ix);
+
+ *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
+ *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
+ *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
+}
+
+SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
+ // TODO: don't forget to optimize gather_32().
+ U32 rgba = gather_32(grid_8, ix);
+
+ *r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
+ *g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
+ *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
+ *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
+}
+
+SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
+#if defined(__arm__)
+ // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
+ *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
+ *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
+ *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
+#else
+ // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
+ U64 rgb;
+ gather_48(grid_16, ix, &rgb);
+ rgb = swap_endian_16x4(rgb);
+
+ *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
+ *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
+ *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
+#endif
+}
+
+SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
+ // TODO: gather_64()-based fast path?
+ *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
+ *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
+ *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
+ *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
+}
+
+static void clut(uint32_t input_channels, uint32_t output_channels,
+ const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
+ F* r, F* g, F* b, F* a) {
+
+ const int dim = (int)input_channels;
+ assert (0 < dim && dim <= 4);
+ assert (output_channels == 3 ||
+ output_channels == 4);
+
+ // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
+ I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
+ F weight[8]; // Weight for each contribution, again first low, then high.
+
+ // O(dim) work first: calculate index,weight from r,g,b,a.
+ const F inputs[] = { *r,*g,*b,*a };
+ for (int i = dim-1, stride = 1; i >= 0; i--) {
+ // x is where we logically want to sample the grid in the i-th dimension.
+ F x = inputs[i] * (float)(grid_points[i] - 1);
+
+ // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
+ I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
+ hi = cast<I32>(minus_1_ulp(x+1.0f));
+ // Notice how we fold in the accumulated stride across previous dimensions here.
+ index[i+0] = lo * stride;
+ index[i+4] = hi * stride;
+ stride *= grid_points[i];
+
+ // We'll interpolate between those two integer grid points by t.
+ F t = x - cast<F>(lo); // i.e. fract(x)
+ weight[i+0] = 1-t;
+ weight[i+4] = t;
+ }
+
+ *r = *g = *b = F0;
+ if (output_channels == 4) {
+ *a = F0;
+ }
+
+ // We'll sample 2^dim == 1<<dim table entries per pixel,
+ // in all combinations of low and high in each dimension.
+ for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
+
+ // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
+ // where 0 selects the low index contribution and its weight 1-t,
+ // or 4 the high index contribution and its weight t.
+
+ // Since 0<dim≤4, we can always just start off with the 0-th channel,
+ // then handle the others conditionally.
+ I32 ix = index [0 + (combo&1)*4];
+ F w = weight[0 + (combo&1)*4];
+
+ switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
+ case 3: ix += index [3 + (combo&8)/2];
+ w *= weight[3 + (combo&8)/2];
+ FALLTHROUGH;
+ // fall through
+
+ case 2: ix += index [2 + (combo&4)*1];
+ w *= weight[2 + (combo&4)*1];
+ FALLTHROUGH;
+ // fall through
+
+ case 1: ix += index [1 + (combo&2)*2];
+ w *= weight[1 + (combo&2)*2];
+ }
+
+ F R,G,B,A=F0;
+ if (output_channels == 3) {
+ if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
+ else { sample_clut_16(grid_16,ix, &R,&G,&B); }
+ } else {
+ if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
+ else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
+ }
+ *r += w*R;
+ *g += w*G;
+ *b += w*B;
+ *a += w*A;
+ }
+}
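+
+// Concretely, for a 3-input CLUT (dim == 3) the loop above visits the 8 corners of
+// the surrounding grid cell and accumulates a trilinear blend; e.g. combo == 0b101
+// samples at index[0+4] + index[1+0] + index[2+4] with weight t0 * (1-t1) * t2.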
+
+static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
+ clut(a2b->input_channels, a2b->output_channels,
+ a2b->grid_points, a2b->grid_8, a2b->grid_16,
+ r,g,b,&a);
+}
+static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
+ clut(b2a->input_channels, b2a->output_channels,
+ b2a->grid_points, b2a->grid_8, b2a->grid_16,
+ r,g,b,a);
+}
+
+static void exec_ops(const Op* ops, const void** args,
+ const char* src, char* dst, int i) {
+ F r = F0, g = F0, b = F0, a = F1;
+ while (true) {
+ switch (*ops++) {
+ case Op_load_a8:{
+ a = F_from_U8(load<U8>(src + 1*i));
+ } break;
+
+ case Op_load_g8:{
+ r = g = b = F_from_U8(load<U8>(src + 1*i));
+ } break;
+
+ case Op_load_4444:{
+ U16 abgr = load<U16>(src + 2*i);
+
+ r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
+ g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
+ b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
+ a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
+ } break;
+
+ case Op_load_565:{
+ U16 rgb = load<U16>(src + 2*i);
+
+ r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
+ g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
+ b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
+ } break;
+
+ case Op_load_888:{
+ const uint8_t* rgb = (const uint8_t*)(src + 3*i);
+ #if defined(USING_NEON_FP16)
+ // See the explanation under USING_NEON below. This is that doubled up.
+ uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
+ v = vld3q_lane_u8(rgb+ 0, v, 0);
+ v = vld3q_lane_u8(rgb+ 3, v, 2);
+ v = vld3q_lane_u8(rgb+ 6, v, 4);
+ v = vld3q_lane_u8(rgb+ 9, v, 6);
+
+ v = vld3q_lane_u8(rgb+12, v, 8);
+ v = vld3q_lane_u8(rgb+15, v, 10);
+ v = vld3q_lane_u8(rgb+18, v, 12);
+ v = vld3q_lane_u8(rgb+21, v, 14);
+
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
+ #elif defined(USING_NEON)
+ // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
+ // a time. Since we're doing that, we might as well load them into 16-bit lanes.
+ // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
+ uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
+ v = vld3_lane_u8(rgb+0, v, 0);
+ v = vld3_lane_u8(rgb+3, v, 2);
+ v = vld3_lane_u8(rgb+6, v, 4);
+ v = vld3_lane_u8(rgb+9, v, 6);
+
+ // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
+ // convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
+ // ARMv7 and ARMv8 impls.)
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
+ #else
+ r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
+ g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
+ b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
+ #endif
+ } break;
+
+ case Op_load_8888:{
+ U32 rgba = load<U32>(src + 4*i);
+
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
+ } break;
+
+ case Op_load_8888_palette8:{
+ const uint8_t* palette = (const uint8_t*) *args++;
+ I32 ix = cast<I32>(load<U8>(src + 1*i));
+ U32 rgba = gather_32(palette, ix);
+
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
+ } break;
+
+ case Op_load_1010102:{
+ U32 rgba = load<U32>(src + 4*i);
+
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
+ a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
+ } break;
+
+ case Op_load_101010x_XR:{
+ static constexpr float min = -0.752941f;
+ static constexpr float max = 1.25098f;
+ static constexpr float range = max - min;
+ U32 rgba = load<U32>(src + 4*i);
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
+ } break;
+
+ case Op_load_161616LE:{
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x3_t v = vld3q_u16(rgb);
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
+ #elif defined(USING_NEON)
+ uint16x4x3_t v = vld3_u16(rgb);
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
+ #else
+ r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
+ g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
+ b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
+ #endif
+ } break;
+
+ case Op_load_16161616LE:{
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x4_t v = vld4q_u16(rgba);
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
+ #elif defined(USING_NEON)
+ uint16x4x4_t v = vld4_u16(rgba);
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
+ #else
+ U64 px = load<U64>(rgba);
+
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
+ #endif
+ } break;
+
+ case Op_load_161616BE:{
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x3_t v = vld3q_u16(rgb);
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
+ #elif defined(USING_NEON)
+ uint16x4x3_t v = vld3_u16(rgb);
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
+ #else
+ U32 R = load_3<U32>(rgb+0),
+ G = load_3<U32>(rgb+1),
+ B = load_3<U32>(rgb+2);
+ // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
+ r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
+ g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
+ b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
+ #endif
+ } break;
+
+ case Op_load_16161616BE:{
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x4_t v = vld4q_u16(rgba);
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
+ #elif defined(USING_NEON)
+ uint16x4x4_t v = vld4_u16(rgba);
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
+ #else
+ U64 px = swap_endian_16x4(load<U64>(rgba));
+
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
+ #endif
+ } break;
+
+ case Op_load_hhh:{
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x3_t v = vld3q_u16(rgb);
+ U16 R = (U16)v.val[0],
+ G = (U16)v.val[1],
+ B = (U16)v.val[2];
+ #elif defined(USING_NEON)
+ uint16x4x3_t v = vld3_u16(rgb);
+ U16 R = (U16)v.val[0],
+ G = (U16)v.val[1],
+ B = (U16)v.val[2];
+ #else
+ U16 R = load_3<U16>(rgb+0),
+ G = load_3<U16>(rgb+1),
+ B = load_3<U16>(rgb+2);
+ #endif
+ r = F_from_Half(R);
+ g = F_from_Half(G);
+ b = F_from_Half(B);
+ } break;
+
+ case Op_load_hhhh:{
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x4_t v = vld4q_u16(rgba);
+ U16 R = (U16)v.val[0],
+ G = (U16)v.val[1],
+ B = (U16)v.val[2],
+ A = (U16)v.val[3];
+ #elif defined(USING_NEON)
+ uint16x4x4_t v = vld4_u16(rgba);
+ U16 R = (U16)v.val[0],
+ G = (U16)v.val[1],
+ B = (U16)v.val[2],
+ A = (U16)v.val[3];
+ #else
+ U64 px = load<U64>(rgba);
+ U16 R = cast<U16>((px >> 0) & 0xffff),
+ G = cast<U16>((px >> 16) & 0xffff),
+ B = cast<U16>((px >> 32) & 0xffff),
+ A = cast<U16>((px >> 48) & 0xffff);
+ #endif
+ r = F_from_Half(R);
+ g = F_from_Half(G);
+ b = F_from_Half(B);
+ a = F_from_Half(A);
+ } break;
+
+ case Op_load_fff:{
+ uintptr_t ptr = (uintptr_t)(src + 12*i);
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
+ const float* rgb = (const float*)ptr; // cast to const float* to be safe.
+ #if defined(USING_NEON_FP16)
+ float32x4x3_t lo = vld3q_f32(rgb + 0),
+ hi = vld3q_f32(rgb + 12);
+ r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
+ g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
+ b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
+ #elif defined(USING_NEON)
+ float32x4x3_t v = vld3q_f32(rgb);
+ r = (F)v.val[0];
+ g = (F)v.val[1];
+ b = (F)v.val[2];
+ #else
+ r = load_3<F>(rgb+0);
+ g = load_3<F>(rgb+1);
+ b = load_3<F>(rgb+2);
+ #endif
+ } break;
+
+ case Op_load_ffff:{
+ uintptr_t ptr = (uintptr_t)(src + 16*i);
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
+ const float* rgba = (const float*)ptr; // cast to const float* to be safe.
+ #if defined(USING_NEON_FP16)
+ float32x4x4_t lo = vld4q_f32(rgba + 0),
+ hi = vld4q_f32(rgba + 16);
+ r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
+ g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
+ b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
+ a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
+ #elif defined(USING_NEON)
+ float32x4x4_t v = vld4q_f32(rgba);
+ r = (F)v.val[0];
+ g = (F)v.val[1];
+ b = (F)v.val[2];
+ a = (F)v.val[3];
+ #else
+ r = load_4<F>(rgba+0);
+ g = load_4<F>(rgba+1);
+ b = load_4<F>(rgba+2);
+ a = load_4<F>(rgba+3);
+ #endif
+ } break;
+
+ case Op_swap_rb:{
+ F t = r;
+ r = b;
+ b = t;
+ } break;
+
+ case Op_clamp:{
+ r = max_(F0, min_(r, F1));
+ g = max_(F0, min_(g, F1));
+ b = max_(F0, min_(b, F1));
+ a = max_(F0, min_(a, F1));
+ } break;
+
+ case Op_invert:{
+ r = F1 - r;
+ g = F1 - g;
+ b = F1 - b;
+ a = F1 - a;
+ } break;
+
+ case Op_force_opaque:{
+ a = F1;
+ } break;
+
+ case Op_premul:{
+ r *= a;
+ g *= a;
+ b *= a;
+ } break;
+
+ case Op_unpremul:{
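+ // (The if_then_else() guards a == 0: 1/a would be +inf there, so we scale by 0
+ // instead and a fully transparent pixel comes out black rather than NaN.)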
+ F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
+ r *= scale;
+ g *= scale;
+ b *= scale;
+ } break;
+
+ case Op_matrix_3x3:{
+ const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
+ const float* m = &matrix->vals[0][0];
+
+ F R = m[0]*r + m[1]*g + m[2]*b,
+ G = m[3]*r + m[4]*g + m[5]*b,
+ B = m[6]*r + m[7]*g + m[8]*b;
+
+ r = R;
+ g = G;
+ b = B;
+ } break;
+
+ case Op_matrix_3x4:{
+ const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
+ const float* m = &matrix->vals[0][0];
+
+ F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
+ G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
+ B = m[8]*r + m[9]*g + m[10]*b + m[11];
+
+ r = R;
+ g = G;
+ b = B;
+ } break;
+
+ case Op_lab_to_xyz:{
+ // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
+ F L = r * 100.0f,
+ A = g * 255.0f - 128.0f,
+ B = b * 255.0f - 128.0f;
+
+ // Convert to CIE XYZ.
+ F Y = (L + 16.0f) * (1/116.0f),
+ X = Y + A*(1/500.0f),
+ Z = Y - B*(1/200.0f);
+
+ X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
+ Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
+ Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
+
+ // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
+ r = X * 0.9642f;
+ g = Y ;
+ b = Z * 0.8249f;
+ } break;
+
+ // As above, in reverse.
+ case Op_xyz_to_lab:{
+ F X = r * (1/0.9642f),
+ Y = g,
+ Z = b * (1/0.8249f);
+
+ X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
+ Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
+ Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
+
+ F L = Y*116.0f - 16.0f,
+ A = (X-Y)*500.0f,
+ B = (Y-Z)*200.0f;
+
+ r = L * (1/100.f);
+ g = (A + 128.0f) * (1/255.0f);
+ b = (B + 128.0f) * (1/255.0f);
+ } break;
+
+ case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
+ case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
+ case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
+ case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
+
+ case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
+ case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
+ case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
+ case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
+
+ case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
+ case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
+ case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
+ case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
+
+ case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
+ case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
+ case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
+ case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
+
+ case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
+ case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
+ case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
+ case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
+
+ case Op_clut_A2B: {
+ const skcms_A2B* a2b = (const skcms_A2B*) *args++;
+ clut(a2b, &r,&g,&b,a);
+
+ if (a2b->input_channels == 4) {
+ // CMYK is opaque.
+ a = F1;
+ }
+ } break;
+
+ case Op_clut_B2A: {
+ const skcms_B2A* b2a = (const skcms_B2A*) *args++;
+ clut(b2a, &r,&g,&b,&a);
+ } break;
+
+ // Notice, from here on down the store_ ops all return, ending the loop.
+
+ case Op_store_a8: {
+ store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
+ } return;
+
+ case Op_store_g8: {
+ // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
+ store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
+ } return;
+
+ case Op_store_4444: {
+ store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
+ | cast<U16>(to_fixed(g * 15) << 8)
+ | cast<U16>(to_fixed(b * 15) << 4)
+ | cast<U16>(to_fixed(a * 15) << 0));
+ } return;
+
+ case Op_store_565: {
+ store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
+ | cast<U16>(to_fixed(g * 63) << 5 )
+ | cast<U16>(to_fixed(b * 31) << 11 ));
+ } return;
+
+ case Op_store_888: {
+ uint8_t* rgb = (uint8_t*)dst + 3*i;
+ #if defined(USING_NEON_FP16)
+ // See the explanation under USING_NEON below. This is that doubled up.
+ U16 R = to_fixed(r * 255),
+ G = to_fixed(g * 255),
+ B = to_fixed(b * 255);
+
+ uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
+ vst3q_lane_u8(rgb+ 0, v, 0);
+ vst3q_lane_u8(rgb+ 3, v, 2);
+ vst3q_lane_u8(rgb+ 6, v, 4);
+ vst3q_lane_u8(rgb+ 9, v, 6);
+
+ vst3q_lane_u8(rgb+12, v, 8);
+ vst3q_lane_u8(rgb+15, v, 10);
+ vst3q_lane_u8(rgb+18, v, 12);
+ vst3q_lane_u8(rgb+21, v, 14);
+ #elif defined(USING_NEON)
+ // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
+ // get there via U16 to save some instructions converting to float. And just
+ // like load_888, we'd prefer to go via U32 but for ARMv7 support.
+ U16 R = cast<U16>(to_fixed(r * 255)),
+ G = cast<U16>(to_fixed(g * 255)),
+ B = cast<U16>(to_fixed(b * 255));
+
+ uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
+ vst3_lane_u8(rgb+0, v, 0);
+ vst3_lane_u8(rgb+3, v, 2);
+ vst3_lane_u8(rgb+6, v, 4);
+ vst3_lane_u8(rgb+9, v, 6);
+ #else
+ store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
+ store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
+ store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
+ #endif
+ } return;
+
+ case Op_store_8888: {
+ store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
+ | cast<U32>(to_fixed(g * 255)) << 8
+ | cast<U32>(to_fixed(b * 255)) << 16
+ | cast<U32>(to_fixed(a * 255)) << 24);
+ } return;
+
+ case Op_store_101010x_XR: {
+ static constexpr float min = -0.752941f;
+ static constexpr float max = 1.25098f;
+ static constexpr float range = max - min;
+ store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) << 0
+ | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
+ | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
+ return;
+ }
+ case Op_store_1010102: {
+ store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
+ | cast<U32>(to_fixed(g * 1023)) << 10
+ | cast<U32>(to_fixed(b * 1023)) << 20
+ | cast<U32>(to_fixed(a * 3)) << 30);
+ } return;
+
+ case Op_store_161616LE: {
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x3_t v = {{
+ (uint16x8_t)U16_from_F(r),
+ (uint16x8_t)U16_from_F(g),
+ (uint16x8_t)U16_from_F(b),
+ }};
+ vst3q_u16(rgb, v);
+ #elif defined(USING_NEON)
+ uint16x4x3_t v = {{
+ (uint16x4_t)U16_from_F(r),
+ (uint16x4_t)U16_from_F(g),
+ (uint16x4_t)U16_from_F(b),
+ }};
+ vst3_u16(rgb, v);
+ #else
+ store_3(rgb+0, U16_from_F(r));
+ store_3(rgb+1, U16_from_F(g));
+ store_3(rgb+2, U16_from_F(b));
+ #endif
+
+ } return;
+
+ case Op_store_16161616LE: {
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x4_t v = {{
+ (uint16x8_t)U16_from_F(r),
+ (uint16x8_t)U16_from_F(g),
+ (uint16x8_t)U16_from_F(b),
+ (uint16x8_t)U16_from_F(a),
+ }};
+ vst4q_u16(rgba, v);
+ #elif defined(USING_NEON)
+ uint16x4x4_t v = {{
+ (uint16x4_t)U16_from_F(r),
+ (uint16x4_t)U16_from_F(g),
+ (uint16x4_t)U16_from_F(b),
+ (uint16x4_t)U16_from_F(a),
+ }};
+ vst4_u16(rgba, v);
+ #else
+ U64 px = cast<U64>(to_fixed(r * 65535)) << 0
+ | cast<U64>(to_fixed(g * 65535)) << 16
+ | cast<U64>(to_fixed(b * 65535)) << 32
+ | cast<U64>(to_fixed(a * 65535)) << 48;
+ store(rgba, px);
+ #endif
+ } return;
+
+ case Op_store_161616BE: {
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x3_t v = {{
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
+ }};
+ vst3q_u16(rgb, v);
+ #elif defined(USING_NEON)
+ uint16x4x3_t v = {{
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+ }};
+ vst3_u16(rgb, v);
+ #else
+ U32 R = to_fixed(r * 65535),
+ G = to_fixed(g * 65535),
+ B = to_fixed(b * 65535);
+ store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
+ store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
+ store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
+ #endif
+
+ } return;
+
+ case Op_store_16161616BE: {
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON_FP16)
+ uint16x8x4_t v = {{
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
+ (uint16x8_t)swap_endian_16(U16_from_F(a)),
+ }};
+ vst4q_u16(rgba, v);
+ #elif defined(USING_NEON)
+ uint16x4x4_t v = {{
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
+ }};
+ vst4_u16(rgba, v);
+ #else
+ U64 px = cast<U64>(to_fixed(r * 65535)) << 0
+ | cast<U64>(to_fixed(g * 65535)) << 16
+ | cast<U64>(to_fixed(b * 65535)) << 32
+ | cast<U64>(to_fixed(a * 65535)) << 48;
+ store(rgba, swap_endian_16x4(px));
+ #endif
+ } return;
+
+ case Op_store_hhh: {
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
+
+ U16 R = Half_from_F(r),
+ G = Half_from_F(g),
+ B = Half_from_F(b);
+ #if defined(USING_NEON_FP16)
+ uint16x8x3_t v = {{
+ (uint16x8_t)R,
+ (uint16x8_t)G,
+ (uint16x8_t)B,
+ }};
+ vst3q_u16(rgb, v);
+ #elif defined(USING_NEON)
+ uint16x4x3_t v = {{
+ (uint16x4_t)R,
+ (uint16x4_t)G,
+ (uint16x4_t)B,
+ }};
+ vst3_u16(rgb, v);
+ #else
+ store_3(rgb+0, R);
+ store_3(rgb+1, G);
+ store_3(rgb+2, B);
+ #endif
+ } return;
+
+ case Op_store_hhhh: {
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
+
+ U16 R = Half_from_F(r),
+ G = Half_from_F(g),
+ B = Half_from_F(b),
+ A = Half_from_F(a);
+ #if defined(USING_NEON_FP16)
+ uint16x8x4_t v = {{
+ (uint16x8_t)R,
+ (uint16x8_t)G,
+ (uint16x8_t)B,
+ (uint16x8_t)A,
+ }};
+ vst4q_u16(rgba, v);
+ #elif defined(USING_NEON)
+ uint16x4x4_t v = {{
+ (uint16x4_t)R,
+ (uint16x4_t)G,
+ (uint16x4_t)B,
+ (uint16x4_t)A,
+ }};
+ vst4_u16(rgba, v);
+ #else
+ store(rgba, cast<U64>(R) << 0
+ | cast<U64>(G) << 16
+ | cast<U64>(B) << 32
+ | cast<U64>(A) << 48);
+ #endif
+
+ } return;
+
+ case Op_store_fff: {
+ uintptr_t ptr = (uintptr_t)(dst + 12*i);
+ assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
+ float* rgb = (float*)ptr; // for this cast to float* to be safe.
+ #if defined(USING_NEON_FP16)
+ float32x4x3_t lo = {{
+ vcvt_f32_f16(vget_low_f16(r)),
+ vcvt_f32_f16(vget_low_f16(g)),
+ vcvt_f32_f16(vget_low_f16(b)),
+ }}, hi = {{
+ vcvt_f32_f16(vget_high_f16(r)),
+ vcvt_f32_f16(vget_high_f16(g)),
+ vcvt_f32_f16(vget_high_f16(b)),
+ }};
+ vst3q_f32(rgb + 0, lo);
+ vst3q_f32(rgb + 12, hi);
+ #elif defined(USING_NEON)
+ float32x4x3_t v = {{
+ (float32x4_t)r,
+ (float32x4_t)g,
+ (float32x4_t)b,
+ }};
+ vst3q_f32(rgb, v);
+ #else
+ store_3(rgb+0, r);
+ store_3(rgb+1, g);
+ store_3(rgb+2, b);
+ #endif
+ } return;
+
+ case Op_store_ffff: {
+ uintptr_t ptr = (uintptr_t)(dst + 16*i);
+ assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
+ float* rgba = (float*)ptr; // for this cast to float* to be safe.
+ #if defined(USING_NEON_FP16)
+ float32x4x4_t lo = {{
+ vcvt_f32_f16(vget_low_f16(r)),
+ vcvt_f32_f16(vget_low_f16(g)),
+ vcvt_f32_f16(vget_low_f16(b)),
+ vcvt_f32_f16(vget_low_f16(a)),
+ }}, hi = {{
+ vcvt_f32_f16(vget_high_f16(r)),
+ vcvt_f32_f16(vget_high_f16(g)),
+ vcvt_f32_f16(vget_high_f16(b)),
+ vcvt_f32_f16(vget_high_f16(a)),
+ }};
+ vst4q_f32(rgba + 0, lo);
+ vst4q_f32(rgba + 16, hi);
+ #elif defined(USING_NEON)
+ float32x4x4_t v = {{
+ (float32x4_t)r,
+ (float32x4_t)g,
+ (float32x4_t)b,
+ (float32x4_t)a,
+ }};
+ vst4q_f32(rgba, v);
+ #else
+ store_4(rgba+0, r);
+ store_4(rgba+1, g);
+ store_4(rgba+2, b);
+ store_4(rgba+3, a);
+ #endif
+ } return;
+ }
+ }
+}
+
+
+static void run_program(const Op* program, const void** arguments,
+ const char* src, char* dst, int n,
+ const size_t src_bpp, const size_t dst_bpp) {
+ int i = 0;
+ while (n >= N) {
+ exec_ops(program, arguments, src, dst, i);
+ i += N;
+ n -= N;
+ }
+ if (n > 0) {
+ char tmp[4*4*N] = {0};
+
+ memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
+ exec_ops(program, arguments, tmp, tmp, 0);
+ memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
+ }
+}
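+
+// A note on the tail handling above: tmp is 4*4*N bytes because the widest pixel
+// format handled here is four float channels (16 bytes per pixel), so any partial
+// run of n < N pixels can be staged through tmp for both the load and the store.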
+
+// Clean up any #defines we may have set so that we can be #included again.
+#if defined(USING_AVX)
+ #undef USING_AVX
+#endif
+#if defined(USING_AVX_F16C)
+ #undef USING_AVX_F16C
+#endif
+#if defined(USING_AVX2)
+ #undef USING_AVX2
+#endif
+#if defined(USING_AVX512F)
+ #undef USING_AVX512F
+#endif
+
+#if defined(USING_NEON)
+ #undef USING_NEON
+#endif
+#if defined(USING_NEON_F16C)
+ #undef USING_NEON_F16C
+#endif
+#if defined(USING_NEON_FP16)
+ #undef USING_NEON_FP16
+#endif
+
+#undef FALLTHROUGH
diff --git a/gfx/skia/skia/modules/skcms/version.sha1 b/gfx/skia/skia/modules/skcms/version.sha1
new file mode 100755
index 0000000000..ae57df7d06
--- /dev/null
+++ b/gfx/skia/skia/modules/skcms/version.sha1
@@ -0,0 +1 @@
+ba39d81f9797aa973bdf01aa6b0363b280352fba