From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 gfx/qcms/src/c_bindings.rs     |  507 ++++++++++++
 gfx/qcms/src/chain.rs          | 1029 ++++++++++++++++++++++++
 gfx/qcms/src/gtest.rs          |  962 ++++++++++++++++++++++
 gfx/qcms/src/iccread.rs        | 1718 ++++++++++++++++++++++++++++++++++++++++
 gfx/qcms/src/lib.rs            |   72 ++
 gfx/qcms/src/matrix.rs         |  134 ++++
 gfx/qcms/src/transform.rs      | 1571 ++++++++++++++++++++++++++++++++++++
 gfx/qcms/src/transform_avx.rs  |  230 ++++++
 gfx/qcms/src/transform_neon.rs |  158 ++++
 gfx/qcms/src/transform_sse2.rs |  159 ++++
 gfx/qcms/src/transform_util.rs |  608 ++++++++++++++
 11 files changed, 7148 insertions(+)
 create mode 100644 gfx/qcms/src/c_bindings.rs
 create mode 100644 gfx/qcms/src/chain.rs
 create mode 100644 gfx/qcms/src/gtest.rs
 create mode 100644 gfx/qcms/src/iccread.rs
 create mode 100644 gfx/qcms/src/lib.rs
 create mode 100644 gfx/qcms/src/matrix.rs
 create mode 100644 gfx/qcms/src/transform.rs
 create mode 100644 gfx/qcms/src/transform_avx.rs
 create mode 100644 gfx/qcms/src/transform_neon.rs
 create mode 100644 gfx/qcms/src/transform_sse2.rs
 create mode 100644 gfx/qcms/src/transform_util.rs

(limited to 'gfx/qcms/src')

diff --git a/gfx/qcms/src/c_bindings.rs b/gfx/qcms/src/c_bindings.rs
new file mode 100644
index 0000000000..efe6674fe5
--- /dev/null
+++ b/gfx/qcms/src/c_bindings.rs
@@ -0,0 +1,507 @@
+#![allow(clippy::missing_safety_doc)]
+
+use std::{ptr::null_mut, slice};
+
+use libc::{fclose, fopen, fread, free, malloc, memset, FILE};
+
+use crate::{
+    double_to_s15Fixed16Number,
+    iccread::*,
+    s15Fixed16Number_to_float,
+    transform::get_rgb_colorants,
+    transform::DataType,
+    transform::{qcms_transform, transform_create},
+    transform_util,
+    Intent,
+};
+
+/// Creates the built-in sRGB profile and hands ownership of it to the caller.
+///
+/// The returned pointer comes from `Box::into_raw`; pass it to
+/// `qcms_profile_release` to free it.
+#[no_mangle]
+pub extern "C" fn qcms_profile_sRGB() -> *mut Profile {
+    Box::into_raw(Profile::new_sRGB())
+}
+
+//XXX: it would be nice if we had a way of ensuring
+// everything in a profile was initialized regardless of how it was created
+//XXX: should this also be taking a black_point?
+/* similar to CGColorSpaceCreateCalibratedRGB */
+/// Builds an RGB profile from a white point, primaries and one gamma per channel.
+///
+/// Returns null when `Profile::new_rgb_with_gamma_set` yields `None`.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_create_rgb_with_gamma_set(
+    white_point: qcms_CIE_xyY,
+    primaries: qcms_CIE_xyYTRIPLE,
+    redGamma: f32,
+    greenGamma: f32,
+    blueGamma: f32,
+) -> *mut Profile {
+    let created =
+        Profile::new_rgb_with_gamma_set(white_point, primaries, redGamma, greenGamma, blueGamma);
+    match created {
+        Some(boxed) => Box::into_raw(boxed),
+        None => null_mut(),
+    }
+}
+
+/// Builds a gray profile with the given gamma and hands ownership of it to the caller.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_create_gray_with_gamma(gamma: f32) -> *mut Profile {
+    Box::into_raw(Profile::new_gray_with_gamma(gamma))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_create_rgb_with_gamma(
+    white_point: qcms_CIE_xyY,
+    primaries: qcms_CIE_xyYTRIPLE,
+    gamma: f32, // one gamma shared by the red, green and blue channels
+) -> *mut Profile {
+    qcms_profile_create_rgb_with_gamma_set(white_point, primaries, gamma, gamma, gamma)
+}
+
+/// Builds an RGB profile from a white point, primaries and a caller-supplied `u16` table.
+///
+/// `table` must point to `num_entries` readable `u16` values. Returns null when
+/// `table` is null, `num_entries` is negative, or the profile cannot be built.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_create_rgb_with_table(
+    white_point: qcms_CIE_xyY,
+    primaries: qcms_CIE_xyYTRIPLE,
+    table: *const u16,
+    num_entries: i32,
+) -> *mut Profile {
+    // `slice::from_raw_parts` requires a non-null pointer, and a negative count would turn
+    // into a huge length when cast to `usize`; reject both instead of invoking undefined
+    // behaviour.
+    if table.is_null() || num_entries < 0 {
+        return null_mut();
+    }
+    let table = slice::from_raw_parts(table, num_entries as usize);
+    let profile = Profile::new_rgb_with_table(white_point, primaries, table);
+    profile.map_or_else(null_mut, Box::into_raw)
+}
+
+/// Builds a profile from the given `colour_primaries` and `transfer_characteristics` codes.
+///
+/// Returns null when `Profile::new_cicp` yields `None`.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_create_cicp(
+    colour_primaries: u8,
+    transfer_characteristics: u8,
+) -> *mut Profile {
+    match Profile::new_cicp(colour_primaries.into(), transfer_characteristics.into()) {
+        Some(profile) => Box::into_raw(profile),
+        None => null_mut(),
+    }
+}
+
+/// Parses a profile from the `size` bytes starting at `mem`.
+///
+/// qcms_profile_from_memory does not hold a reference to the memory passed in.
+/// Returns null when `mem` is null or the bytes cannot be parsed.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_from_memory(
+    mem: *const libc::c_void,
+    size: usize,
+) -> *mut Profile {
+    // `slice::from_raw_parts` requires a non-null pointer even for a zero-length slice.
+    if mem.is_null() {
+        return null_mut();
+    }
+    let mem = slice::from_raw_parts(mem as *const libc::c_uchar, size);
+    let profile = Profile::new_from_slice(mem, false);
+    profile.map_or_else(null_mut, Box::into_raw)
+}
+
+/// Parses a profile from the `size` bytes starting at `mem`, passing `true` as the
+/// curves-only flag of `Profile::new_from_slice`.
+///
+/// Returns null when `mem` is null or the bytes cannot be parsed.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_from_memory_curves_only(
+    mem: *const libc::c_void,
+    size: usize,
+) -> *mut Profile {
+    // `slice::from_raw_parts` requires a non-null pointer even for a zero-length slice.
+    if mem.is_null() {
+        return null_mut();
+    }
+    let mem = slice::from_raw_parts(mem as *const libc::c_uchar, size);
+    let profile = Profile::new_from_slice(mem, true);
+    profile.map_or_else(null_mut, Box::into_raw)
+}
+
+
+#[no_mangle]
+pub extern "C" fn qcms_profile_get_rendering_intent(profile: &Profile) -> Intent {
+    profile.rendering_intent // plain field read; taking `&Profile` means C must pass a valid, non-null pointer
+}
+#[no_mangle]
+pub extern "C" fn qcms_profile_get_color_space(profile: &Profile) -> icColorSpaceSignature {
+    profile.color_space // plain field read; taking `&Profile` means C must pass a valid, non-null pointer
+}
+#[no_mangle]
+pub extern "C" fn qcms_profile_is_sRGB(profile: &Profile) -> bool {
+    profile.is_sRGB() // forwards to `Profile::is_sRGB`; C must pass a valid, non-null pointer
+}
+
+/// Frees a profile whose pointer was produced by `Box::into_raw`, as the
+/// `qcms_profile_*` constructors in this file do.
+///
+/// Passing null is a no-op.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_release(profile: *mut Profile) {
+    // `Box::from_raw` on a null pointer is undefined behaviour, so tolerate null the way
+    // C's `free` does.
+    if !profile.is_null() {
+        drop(Box::from_raw(profile));
+    }
+}
+unsafe extern "C" fn qcms_data_from_file(
+    file: *mut FILE,
+    mem: *mut *mut libc::c_void, // out: malloc'ed buffer holding the whole profile, or null on failure
+    size: *mut usize, // out: length of that buffer in bytes, or 0 on failure
+) {
+    let length: u32;
+    let remaining_length: u32;
+    let read_length: usize;
+    let mut length_be: u32 = 0;
+    let data: *mut libc::c_void;
+    *mem = std::ptr::null_mut::<libc::c_void>(); // pre-set the failure result for every early return
+    *size = 0;
+    if fread( // the stream starts with the total profile length as a big-endian u32
+        &mut length_be as *mut u32 as *mut libc::c_void,
+        1,
+        ::std::mem::size_of::<u32>(),
+        file,
+    ) != ::std::mem::size_of::<u32>()
+    {
+        return;
+    }
+    length = u32::from_be(length_be);
+    if length > MAX_PROFILE_SIZE as libc::c_uint
+        || (length as libc::c_ulong) < ::std::mem::size_of::<u32>() as libc::c_ulong
+    {
+        return; // reject oversized profiles and lengths too small to hold the length field itself
+    }
+    /* allocate room for the entire profile */
+    data = malloc(length as usize);
+    if data.is_null() {
+        return;
+    }
+    /* copy the raw (still big-endian) length to the front so the buffer holds the entire profile */
+    *(data as *mut u32) = length_be;
+    remaining_length =
+        (length as libc::c_ulong - ::std::mem::size_of::<u32>() as libc::c_ulong) as u32;
+    /* read the rest of the profile, directly after the length field */
+    read_length = fread(
+        (data as *mut libc::c_uchar).add(::std::mem::size_of::<u32>()) as *mut libc::c_void,
+        1,
+        remaining_length as usize,
+        file,
+    ) as usize;
+    if read_length != remaining_length as usize {
+        free(data); // short read: release the buffer; `*mem` and `*size` stay null and 0
+        return;
+    }
+    /* success: hand the buffer and its length to the caller */
+    *mem = data;
+    *size = length as usize;
+}
+
+/// Reads a profile from an open C `FILE` stream; returns null on any failure.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_from_file(file: *mut FILE) -> *mut Profile {
+    let mut buffer: *mut libc::c_void = null_mut();
+    let mut buffer_len: usize = 0;
+    qcms_data_from_file(file, &mut buffer, &mut buffer_len);
+    if buffer.is_null() || buffer_len == 0 {
+        return null_mut();
+    }
+    // The buffer is only needed while parsing, so it is freed before returning.
+    let parsed = qcms_profile_from_memory(buffer, buffer_len);
+    free(buffer);
+    parsed
+}
+/// Loads a profile from the file named by the NUL-terminated string `path`.
+///
+/// Returns null when `path` is not valid UTF-8 or `Profile::new_from_path` yields `None`.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_from_path(path: *const libc::c_char) -> *mut Profile {
+    let path = match std::ffi::CStr::from_ptr(path).to_str() {
+        Ok(path) => path,
+        Err(_) => return std::ptr::null_mut(),
+    };
+    match Profile::new_from_path(path) {
+        Some(boxed_profile) => Box::into_raw(boxed_profile),
+        None => std::ptr::null_mut(),
+    }
+}
+
+/// Reads the raw profile bytes of the file at `path` into a `malloc`ed buffer.
+///
+/// On success `*mem` and `*size` describe the buffer; on any failure they are left as
+/// null and 0.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_data_from_path(
+    path: *const libc::c_char,
+    mem: *mut *mut libc::c_void,
+    size: *mut usize,
+) {
+    // Pre-set the failure result so the early return below reports "no data".
+    *mem = std::ptr::null_mut::<libc::c_void>();
+    *size = 0;
+    let mode = b"rb\x00" as *const u8 as *const libc::c_char;
+    let file = fopen(path, mode);
+    if file.is_null() {
+        return;
+    }
+    qcms_data_from_file(file, mem, size);
+    fclose(file);
+}
+
+#[cfg(windows)]
+extern "C" { // foreign declaration, compiled on Windows only; takes wide-character path and mode strings
+    pub fn _wfopen(filename: *const libc::wchar_t, mode: *const libc::wchar_t) -> *mut FILE;
+}
+
+/// Loads a profile from the file at the NUL-terminated wide-character `path`.
+///
+/// Returns the parsed profile, or null when the file cannot be opened or parsed.
+#[cfg(windows)]
+#[no_mangle]
+pub unsafe extern "C" fn qcms_profile_from_unicode_path(
+    path: *const libc::wchar_t,
+) -> *mut Profile {
+    let file = _wfopen(path, ['r' as u16, 'b' as u16, '\0' as u16].as_ptr());
+    if file.is_null() {
+        return null_mut();
+    }
+    // The parsed profile used to be discarded here, which leaked it and gave the caller
+    // nothing; hand it back instead.
+    let profile = qcms_profile_from_file(file);
+    fclose(file);
+    profile
+}
+
+/// Reads the raw profile bytes of the file at the wide-character `path` into a
+/// `malloc`ed buffer.
+///
+/// On success `*mem` and `*size` describe the buffer; on any failure they are left as
+/// null and 0.
+#[cfg(windows)]
+#[no_mangle]
+pub unsafe extern "C" fn qcms_data_from_unicode_path(
+    path: *const libc::wchar_t,
+    mem: *mut *mut libc::c_void,
+    size: *mut usize,
+) {
+    *mem = std::ptr::null_mut();
+    *size = 0;
+    // NUL-terminated wide string "rb".
+    let mode = ['r' as u16, 'b' as u16, '\0' as u16];
+    let file = _wfopen(path, mode.as_ptr());
+    if file.is_null() {
+        return;
+    }
+    qcms_data_from_file(file, mem, size);
+    fclose(file);
+}
+
+/// Creates a transform from profile `in_0` to profile `out` for the given data types and
+/// rendering intent.
+///
+/// Returns null when `transform_create` yields `None`.
+#[no_mangle]
+pub extern "C" fn qcms_transform_create(
+    in_0: &Profile,
+    in_type: DataType,
+    out: &Profile,
+    out_type: DataType,
+    intent: Intent,
+) -> *mut qcms_transform {
+    transform_create(in_0, in_type, out, out_type, intent).map_or_else(null_mut, Box::into_raw)
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn qcms_data_create_rgb_with_gamma(
+    white_point: qcms_CIE_xyY,
+    primaries: qcms_CIE_xyYTRIPLE,
+    gamma: f32, // single gamma written into all three TRC tags
+    mem: *mut *mut libc::c_void, // out: malloc'ed profile bytes, or null on failure
+    size: *mut usize, // out: byte length of `*mem`, or 0 on failure
+) {
+    let length: u32;
+    let mut index: u32;
+    let xyz_count: u32;
+    let trc_count: u32;
+    let mut tag_table_offset: usize;
+    let mut tag_data_offset: usize;
+    let data: *mut libc::c_void;
+
+    let TAG_XYZ: [u32; 3] = [TAG_rXYZ, TAG_gXYZ, TAG_bXYZ];
+    let TAG_TRC: [u32; 3] = [TAG_rTRC, TAG_gTRC, TAG_bTRC];
+    if mem.is_null() || size.is_null() {
+        return;
+    }
+    *mem = std::ptr::null_mut::<libc::c_void>();
+    *size = 0;
+    /*
+     * total length = icc profile header(128) + tag count(4) +
+     * (tag table item (12) * total tag (6 = 3 rTRC + 3 rXYZ)) + rXYZ elements data (3 * 20)
+     * + rTRC elements data (3 * 16), and all tag data elements must start at the 4-byte boundary.
+     */
+    xyz_count = 3; // rXYZ, gXYZ, bXYZ
+    trc_count = 3; // rTRC, gTRC, bTRC
+    length =
+        (128 + 4) as libc::c_uint + 12 * (xyz_count + trc_count) + xyz_count * 20 + trc_count * 16;
+    // reserve the total memory.
+    data = malloc(length as usize);
+    if data.is_null() {
+        return;
+    }
+    memset(data, 0, length as usize);
+    // Part1 : write rXYZ, gXYZ and bXYZ
+    let colorants = match get_rgb_colorants(white_point, primaries) {
+        Some(colorants) => colorants,
+        None => {
+            free(data);
+            return;
+        }
+    };
+
+    let data = std::slice::from_raw_parts_mut(data as *mut u8, length as usize);
+    // the tag table starts right after the header and tag count; tag data elements follow the table
+    tag_table_offset = (128 + 4) as usize;
+    tag_data_offset = ((128 + 4) as libc::c_uint + 12 * (xyz_count + trc_count)) as usize;
+    index = 0;
+    while index < xyz_count {
+        // tag table
+        write_u32(data, tag_table_offset, TAG_XYZ[index as usize]); // tag signature
+        write_u32(data, tag_table_offset + 4, tag_data_offset as u32); // offset of its data element
+        write_u32(data, tag_table_offset + 8, 20); // 20 bytes per TAG_(r/g/b)XYZ tag element
+        // tag data element
+        write_u32(data, tag_data_offset, XYZ_TYPE);
+        // reserved 4 bytes (left zero by the memset above).
+        write_u32(
+            data,
+            tag_data_offset + 8,
+            double_to_s15Fixed16Number(colorants.m[0][index as usize] as f64) as u32,
+        );
+        write_u32(
+            data,
+            tag_data_offset + 12,
+            double_to_s15Fixed16Number(colorants.m[1][index as usize] as f64) as u32,
+        );
+        write_u32(
+            data,
+            tag_data_offset + 16,
+            double_to_s15Fixed16Number(colorants.m[2][index as usize] as f64) as u32,
+        );
+        tag_table_offset += 12;
+        tag_data_offset += 20;
+        index += 1
+    }
+    // Part2 : write rTRC, gTRC and bTRC
+    index = 0;
+    while index < trc_count {
+        // tag table
+        write_u32(data, tag_table_offset, TAG_TRC[index as usize]); // tag signature
+        write_u32(data, tag_table_offset + 4, tag_data_offset as u32); // offset of its data element
+        write_u32(data, tag_table_offset + 8, 14); // 14 bytes per TAG_(r/g/b)TRC element
+        // tag data element
+        write_u32(data, tag_data_offset, CURVE_TYPE);
+        // reserved 4 bytes (left zero by the memset above).
+        write_u32(data, tag_data_offset + 8, 1); // count
+        write_u16(data, tag_data_offset + 12, float_to_u8Fixed8Number(gamma));
+        tag_table_offset += 12;
+        tag_data_offset += 16; // 14 bytes rounded up so the next element stays 4-byte aligned
+        index += 1
+    }
+    /* Part3 : write profile header
+     *
+     * Important header fields are left empty. This generates a profile for internal use only.
+     * We should be generating: Profile version (04300000h), Profile signature (acsp),
+     * PCS illuminant field. Likewise mandatory profile tags are omitted.
+     */
+    write_u32(data, 0, length); // the total length of this memory
+    write_u32(data, 12, DISPLAY_DEVICE_PROFILE); // profile->class_type
+    write_u32(data, 16, RGB_SIGNATURE); // profile->color_space
+    write_u32(data, 20, XYZ_TYPE); // profile->pcs
+    write_u32(data, 64, Intent::Perceptual as u32); // profile->rendering_intent
+    write_u32(data, 128, 6); // total tag count
+    // prepare the result
+    *mem = data.as_mut_ptr() as *mut libc::c_void;
+    *size = length as usize;
+}
+
+/// Applies `transform` to the data at `src`, writing the result to `dest`.
+///
+/// `src`, `dest` and `length` are forwarded unchanged (as byte pointers) to the transform's
+/// function pointer. Panics if that function pointer is `None`.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_transform_data(
+    transform: &qcms_transform,
+    src: *const libc::c_void,
+    dest: *mut libc::c_void,
+    length: usize,
+) {
+    let run = transform.transform_fn.expect("non-null function pointer");
+    run(transform, src as *const u8, dest as *mut u8, length);
+}
+/*
+use crate::matrix;
+#[repr(C)]
+#[derive(Clone, Debug, Default)]
+pub struct qcms_mat3r3 {
+    pub rows: [[f32; 3] ; 3],
+}
+impl qcms_mat3r3 {
+    fn from(m: matrix::Matrix) -> qcms_mat3r3 {
+        qcms_mat3r3{
+            rows: [
+                m.row(0),
+                m.row(1),
+                m.row(2),
+            ],
+        }
+    }
+}
+*/
+#[repr(C)]
+#[derive(Clone, Debug, Default)]
+#[allow(clippy::upper_case_acronyms)]
+pub struct qcms_profile_data { // C-layout snapshot of profile fields, filled in by `qcms_profile_get_data`
+    pub class_type: u32,
+    pub color_space: u32,
+    pub pcs: u32,
+    pub rendering_intent: Intent,
+    pub red_colorant_xyzd50: [f32; 3],
+    pub blue_colorant_xyzd50: [f32; 3], // blue precedes green here; with `#[repr(C)]` the field order is part of the ABI
+    pub green_colorant_xyzd50: [f32; 3],
+    // Number of samples in the e.g. gamma->linear LUT (0: no TRC; -1: parametric, or at most one entry).
+    pub linear_from_trc_red_samples: i32,
+    pub linear_from_trc_blue_samples: i32,
+    pub linear_from_trc_green_samples: i32,
+}
+
+pub use crate::iccread::Profile as qcms_profile;
+
+/// Copies header fields, colorants and TRC sample counts from `profile` into `out_data`.
+///
+/// Colorants are converted from s15Fixed16 to `f32`. Each `linear_from_trc_*_samples` is `0`
+/// when the channel has no TRC, `-1` for a parametric curve or a sampled curve with at most
+/// one entry, and the number of entries otherwise.
+#[no_mangle]
+pub extern "C" fn qcms_profile_get_data(
+    profile: &qcms_profile,
+    out_data: &mut qcms_profile_data,
+) {
+    /// Converts one fixed-point colorant into `[X, Y, Z]` floats.
+    fn colorant(c: &XYZNumber) -> [f32; 3] {
+        [
+            s15Fixed16Number_to_float(c.X),
+            s15Fixed16Number_to_float(c.Y),
+            s15Fixed16Number_to_float(c.Z),
+        ]
+    }
+
+    /// Sample count reported for one channel's tone curve.
+    fn trc_to_samples(trc: &Option<Box<curveType>>) -> i32 {
+        match trc.as_deref() {
+            None => 0,
+            Some(curveType::Parametric(_)) => -1,
+            Some(curveType::Curve(samples)) if samples.len() <= 1 => -1,
+            Some(curveType::Curve(samples)) => samples.len() as i32,
+        }
+    }
+
+    out_data.class_type = profile.class_type;
+    out_data.color_space = profile.color_space;
+    out_data.pcs = profile.pcs;
+    out_data.rendering_intent = profile.rendering_intent;
+
+    out_data.red_colorant_xyzd50 = colorant(&profile.redColorant);
+    out_data.blue_colorant_xyzd50 = colorant(&profile.blueColorant);
+    out_data.green_colorant_xyzd50 = colorant(&profile.greenColorant);
+
+    out_data.linear_from_trc_red_samples = trc_to_samples(&profile.redTRC);
+    out_data.linear_from_trc_blue_samples = trc_to_samples(&profile.blueTRC);
+    out_data.linear_from_trc_green_samples = trc_to_samples(&profile.greenTRC);
+}
+
+#[repr(u8)]
+pub enum qcms_color_channel { // selects which channel's TRC `qcms_profile_get_lut` reads; one byte wide on the C side
+    Red,
+    Green,
+    Blue,
+}
+
+/// Fills the `f32` range `out_begin..out_end` with the selected channel's TRC, sampled as
+/// a lookup table with one entry per output element and normalised by `u16::MAX`.
+///
+/// Panics when the generated table length differs from the output length; in particular
+/// when the channel has no TRC and the output range is non-empty.
+#[no_mangle]
+pub extern "C" fn qcms_profile_get_lut(
+    profile: &qcms_profile,
+    channel: qcms_color_channel, // FYI: UB if you give Rust something out of range!
+    out_begin: *mut f32,
+    out_end: *mut f32,
+) {
+    let out_len = unsafe { out_end.offset_from(out_begin) } as usize;
+    let out_slice = unsafe { std::slice::from_raw_parts_mut(out_begin, out_len) };
+
+    let trc = match channel {
+        qcms_color_channel::Red => &profile.redTRC,
+        qcms_color_channel::Green => &profile.greenTRC,
+        qcms_color_channel::Blue => &profile.blueTRC,
+    };
+
+    // Yes, sub-optimal, but easier to implement, and these aren't big or hot:
+    // 1. Ask for a new vec<u16> lut based on the trc.
+    //   * (eat the extra alloc)
+    // 2. Convert the u16s back out to f32s in our slice.
+    //   * (eat the copy and quantization error from f32->u16->f32 roundtrip)
+    let samples_u16 = match trc {
+        Some(curve) => transform_util::build_lut_for_linear_from_tf(curve, Some(out_len)),
+        None => Vec::new(),
+    };
+
+    assert_eq!(samples_u16.len(), out_slice.len());
+    let full_scale = u16::MAX as f32;
+    for (dst, sample) in out_slice.iter_mut().zip(samples_u16) {
+        *dst = (sample as f32) / full_scale;
+    }
+}
+
+pub type icColorSpaceSignature = u32; // four ASCII characters packed into a u32, first character in the top byte
+pub const icSigGrayData: icColorSpaceSignature = 0x47524159; // 'GRAY'
+pub const icSigRgbData: icColorSpaceSignature = 0x52474220; // 'RGB '
+pub const icSigCmykData: icColorSpaceSignature = 0x434d594b; // 'CMYK'
+
+pub use crate::iccread::qcms_profile_is_bogus;
+pub use crate::transform::{
+    qcms_enable_iccv4, qcms_profile_precache_output_transform, qcms_transform_release,
+};
diff --git a/gfx/qcms/src/chain.rs b/gfx/qcms/src/chain.rs
new file mode 100644
index 0000000000..35a3896138
--- /dev/null
+++ b/gfx/qcms/src/chain.rs
@@ -0,0 +1,1029 @@
+//  qcms
+//  Copyright (C) 2009 Mozilla Corporation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+use crate::{
+    iccread::LAB_SIGNATURE,
+    iccread::RGB_SIGNATURE,
+    iccread::XYZ_SIGNATURE,
+    iccread::{lutType, lutmABType, Profile, CMYK_SIGNATURE},
+    matrix::Matrix,
+    s15Fixed16Number_to_float,
+    transform_util::clamp_float,
+    transform_util::{
+        build_colorant_matrix, build_input_gamma_table, build_output_lut, lut_interp_linear,
+        lut_interp_linear_float,
+    },
+};
+
+trait ModularTransform { // a processing stage over f32 samples, implemented by the stage types in this file
+    fn transform(&self, src: &[f32], dst: &mut [f32]); // reads samples from `src` and writes results to `dst`
+}
+
+/// Linear interpolation between `a` and `b`: yields `a` at `t == 0.0` and `b` at `t == 1.0`.
+#[inline]
+fn lerp(a: f32, b: f32, t: f32) -> f32 {
+    let weight_a = 1.0 - t;
+    let from_a = a * weight_a;
+    let from_b = b * t;
+    from_a + from_b
+}
+
+/// Converts the nine s15Fixed16 matrix entries `e00`..`e22` of `lut` into a float `Matrix`.
+fn build_lut_matrix(lut: &lutType) -> Matrix {
+    let to_f32 = s15Fixed16Number_to_float;
+    Matrix {
+        m: [
+            [to_f32(lut.e00), to_f32(lut.e01), to_f32(lut.e02)],
+            [to_f32(lut.e10), to_f32(lut.e11), to_f32(lut.e12)],
+            [to_f32(lut.e20), to_f32(lut.e21), to_f32(lut.e22)],
+        ],
+    }
+}
+/// Converts the nine s15Fixed16 matrix entries `e00`..`e22` of `lut` into a float `Matrix`.
+fn build_mAB_matrix(lut: &lutmABType) -> Matrix {
+    let to_f32 = s15Fixed16Number_to_float;
+    Matrix {
+        m: [
+            [to_f32(lut.e00), to_f32(lut.e01), to_f32(lut.e02)],
+            [to_f32(lut.e10), to_f32(lut.e11), to_f32(lut.e12)],
+            [to_f32(lut.e20), to_f32(lut.e21), to_f32(lut.e22)],
+        ],
+    }
+}
+//Based on lcms cmsLab2XYZ
+/// Piecewise curve used by the XYZ -> Lab conversion: linear at or below `(24/116)^3`,
+/// cube root above it.
+fn f(t: f32) -> f32 {
+    let threshold: f32 = 24. / 116. * (24. / 116.) * (24. / 116.);
+    if t <= threshold {
+        let slope: f32 = 841. / 108.;
+        (slope * t) + 16. / 116.
+    } else {
+        t.powf(1. / 3.)
+    }
+}
+/// Inverse of `f`: linear at or below the `24/116` breakpoint, cubic above it.
+fn f_1(t: f32) -> f32 {
+    let breakpoint: f32 = 24.0 / 116.0;
+    if t <= breakpoint {
+        let offset = t - 16.0 / 116.0;
+        (108.0 / 841.0) * offset
+    } else {
+        t * t * t
+    }
+}
+
+#[allow(clippy::upper_case_acronyms)]
+struct LABtoXYZ;
+impl ModularTransform for LABtoXYZ {
+    /// Converts each encoded Lab triple in `src` to an XYZ triple in `dest`.
+    ///
+    /// Input L is scaled by 100 and a/b by 255 with a -128 offset; the resulting XYZ is
+    /// divided by `1 + 32767/32768`, the factor `XYZtoLAB` multiplies by.
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        // lcms: D50 XYZ values
+        const WHITE_POINT: [f32; 3] = [0.9642, 1.0, 0.8249];
+        let pcs_scale: f64 = 1.0f64 + 32767.0f64 / 32768.0f64;
+
+        for (xyz, lab) in dest.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
+            let lightness: f32 = lab[0] * 100.0;
+            let a: f32 = lab[1] * 255.0 - 128.0;
+            let b: f32 = lab[2] * 255.0 - 128.0;
+
+            let fy: f32 = (lightness + 16.0) / 116.0;
+
+            let x = f_1(fy + 0.002 * a) * WHITE_POINT[0];
+            let y = f_1(fy) * WHITE_POINT[1];
+            let z = f_1(fy - 0.005 * b) * WHITE_POINT[2];
+
+            xyz[0] = (x as f64 / pcs_scale) as f32;
+            xyz[1] = (y as f64 / pcs_scale) as f32;
+            xyz[2] = (z as f64 / pcs_scale) as f32;
+        }
+    }
+}
+
+#[allow(clippy::upper_case_acronyms)]
+struct XYZtoLAB;
+impl ModularTransform for XYZtoLAB {
+    //Based on lcms cmsXYZ2Lab
+    /// Converts each XYZ triple in `src` to an encoded Lab triple in `dest`.
+    ///
+    /// Output L is divided by 100; a and b are offset by 128 and divided by 255.
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        // lcms: D50 XYZ values
+        const WHITE_POINT: [f32; 3] = [0.9642, 1.0, 0.8249];
+        let pcs_scale: f64 = 1.0f64 + 32767.0f64 / 32768.0f64;
+
+        for (lab, xyz) in dest.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
+            // Scale each component up and normalise it by the white point.
+            let rel_x: f32 = (xyz[0] as f64 * pcs_scale / WHITE_POINT[0] as f64) as f32;
+            let rel_y: f32 = (xyz[1] as f64 * pcs_scale / WHITE_POINT[1] as f64) as f32;
+            let rel_z: f32 = (xyz[2] as f64 * pcs_scale / WHITE_POINT[2] as f64) as f32;
+
+            let (fx, fy, fz) = (f(rel_x), f(rel_y), f(rel_z));
+
+            let lightness: f32 = 116.0 * fy - 16.0;
+            let a: f32 = 500.0 * (fx - fy);
+            let b: f32 = 200.0 * (fy - fz);
+
+            lab[0] = lightness / 100.0;
+            lab[1] = (a + 128.0) / 255.0;
+            lab[2] = (b + 128.0) / 255.0;
+        }
+    }
+}
+#[derive(Default)]
+struct ClutOnly { // stage that maps three inputs to three outputs through a colour lookup table
+    clut: Option<Vec<f32>>, // interleaved table, three f32 outputs per grid point; `transform` unwraps it
+    grid_size: u16, // number of grid points along each of the three input axes
+}
+impl ModularTransform for ClutOnly {
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        let xy_len: i32 = 1; // index stride of the third (z) coordinate
+        let x_len: i32 = self.grid_size as i32; // index stride of the second (y) coordinate
+        let len: i32 = x_len * x_len; // index stride of the first (x) coordinate
+        // The same table viewed from offsets 0, 1 and 2, so index `3 * n` picks output 0, 1 or 2 of grid point `n`.
+        let r_table = &self.clut.as_ref().unwrap()[0..];
+        let g_table = &self.clut.as_ref().unwrap()[1..];
+        let b_table = &self.clut.as_ref().unwrap()[2..];
+        // Looks up one output value at integer grid coordinates (x, y, z).
+        let CLU = |table: &[f32], x, y, z| table[((x * len + y * x_len + z * xy_len) * 3) as usize];
+
+        for (dest, src) in dest.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
+            debug_assert!(self.grid_size as i32 >= 1);
+            let linear_r: f32 = src[0]; // NOTE(review): the indexing below assumes inputs lie in 0..=1 — confirm callers guarantee it
+            let linear_g: f32 = src[1];
+            let linear_b: f32 = src[2];
+            let x: i32 = (linear_r * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let y: i32 = (linear_g * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let z: i32 = (linear_b * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let x_n: i32 = (linear_r * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let y_n: i32 = (linear_g * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let z_n: i32 = (linear_b * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let x_d: f32 = linear_r * (self.grid_size as i32 - 1) as f32 - x as f32;
+            let y_d: f32 = linear_g * (self.grid_size as i32 - 1) as f32 - y as f32;
+            let z_d: f32 = linear_b * (self.grid_size as i32 - 1) as f32 - z as f32;
+            // Trilinear interpolation per output: lerp along x on four edges, then along y, then along z.
+            let r_x1: f32 = lerp(CLU(r_table, x, y, z), CLU(r_table, x_n, y, z), x_d);
+            let r_x2: f32 = lerp(CLU(r_table, x, y_n, z), CLU(r_table, x_n, y_n, z), x_d);
+            let r_y1: f32 = lerp(r_x1, r_x2, y_d);
+            let r_x3: f32 = lerp(CLU(r_table, x, y, z_n), CLU(r_table, x_n, y, z_n), x_d);
+            let r_x4: f32 = lerp(CLU(r_table, x, y_n, z_n), CLU(r_table, x_n, y_n, z_n), x_d);
+            let r_y2: f32 = lerp(r_x3, r_x4, y_d);
+            let clut_r: f32 = lerp(r_y1, r_y2, z_d);
+
+            let g_x1: f32 = lerp(CLU(g_table, x, y, z), CLU(g_table, x_n, y, z), x_d);
+            let g_x2: f32 = lerp(CLU(g_table, x, y_n, z), CLU(g_table, x_n, y_n, z), x_d);
+            let g_y1: f32 = lerp(g_x1, g_x2, y_d);
+            let g_x3: f32 = lerp(CLU(g_table, x, y, z_n), CLU(g_table, x_n, y, z_n), x_d);
+            let g_x4: f32 = lerp(CLU(g_table, x, y_n, z_n), CLU(g_table, x_n, y_n, z_n), x_d);
+            let g_y2: f32 = lerp(g_x3, g_x4, y_d);
+            let clut_g: f32 = lerp(g_y1, g_y2, z_d);
+
+            let b_x1: f32 = lerp(CLU(b_table, x, y, z), CLU(b_table, x_n, y, z), x_d);
+            let b_x2: f32 = lerp(CLU(b_table, x, y_n, z), CLU(b_table, x_n, y_n, z), x_d);
+            let b_y1: f32 = lerp(b_x1, b_x2, y_d);
+            let b_x3: f32 = lerp(CLU(b_table, x, y, z_n), CLU(b_table, x_n, y, z_n), x_d);
+            let b_x4: f32 = lerp(CLU(b_table, x, y_n, z_n), CLU(b_table, x_n, y_n, z_n), x_d);
+            let b_y2: f32 = lerp(b_x3, b_x4, y_d);
+            let clut_b: f32 = lerp(b_y1, b_y2, z_d);
+
+            dest[0] = clamp_float(clut_r);
+            dest[1] = clamp_float(clut_g);
+            dest[2] = clamp_float(clut_b);
+        }
+    }
+}
+#[derive(Default)]
+struct Clut3x3 { // stage: per-channel input tables, then a 3-input/3-output lookup table, then per-channel output tables
+    input_clut_table: [Option<Vec<f32>>; 3], // one table per input channel, applied before the grid lookup
+    clut: Option<Vec<f32>>, // interleaved table, three f32 outputs per grid point
+    grid_size: u16, // number of grid points along each of the three input axes
+    output_clut_table: [Option<Vec<f32>>; 3], // one table per output channel, applied after the grid lookup
+}
+impl ModularTransform for Clut3x3 {
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        let xy_len: i32 = 1; // index stride of the third (z) coordinate
+        let x_len: i32 = self.grid_size as i32; // index stride of the second (y) coordinate
+        let len: i32 = x_len * x_len; // index stride of the first (x) coordinate
+        // The same table viewed from offsets 0, 1 and 2, so index `3 * n` picks output 0, 1 or 2 of grid point `n`.
+        let r_table = &self.clut.as_ref().unwrap()[0..];
+        let g_table = &self.clut.as_ref().unwrap()[1..];
+        let b_table = &self.clut.as_ref().unwrap()[2..];
+        let CLU = |table: &[f32], x, y, z| table[((x * len + y * x_len + z * xy_len) * 3) as usize];
+        // All three input tables must be present: the unwraps below panic otherwise.
+        let input_clut_table_r = self.input_clut_table[0].as_ref().unwrap();
+        let input_clut_table_g = self.input_clut_table[1].as_ref().unwrap();
+        let input_clut_table_b = self.input_clut_table[2].as_ref().unwrap();
+        for (dest, src) in dest.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
+            debug_assert!(self.grid_size as i32 >= 1);
+            let device_r: f32 = src[0];
+            let device_g: f32 = src[1];
+            let device_b: f32 = src[2];
+            let linear_r: f32 = lut_interp_linear_float(device_r, &input_clut_table_r);
+            let linear_g: f32 = lut_interp_linear_float(device_g, &input_clut_table_g);
+            let linear_b: f32 = lut_interp_linear_float(device_b, &input_clut_table_b);
+            let x: i32 = (linear_r * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let y: i32 = (linear_g * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let z: i32 = (linear_b * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let x_n: i32 = (linear_r * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let y_n: i32 = (linear_g * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let z_n: i32 = (linear_b * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let x_d: f32 = linear_r * (self.grid_size as i32 - 1) as f32 - x as f32;
+            let y_d: f32 = linear_g * (self.grid_size as i32 - 1) as f32 - y as f32;
+            let z_d: f32 = linear_b * (self.grid_size as i32 - 1) as f32 - z as f32;
+            // Trilinear interpolation per output: lerp along x on four edges, then along y, then along z.
+            let r_x1: f32 = lerp(CLU(r_table, x, y, z), CLU(r_table, x_n, y, z), x_d);
+            let r_x2: f32 = lerp(CLU(r_table, x, y_n, z), CLU(r_table, x_n, y_n, z), x_d);
+            let r_y1: f32 = lerp(r_x1, r_x2, y_d);
+            let r_x3: f32 = lerp(CLU(r_table, x, y, z_n), CLU(r_table, x_n, y, z_n), x_d);
+            let r_x4: f32 = lerp(CLU(r_table, x, y_n, z_n), CLU(r_table, x_n, y_n, z_n), x_d);
+            let r_y2: f32 = lerp(r_x3, r_x4, y_d);
+            let clut_r: f32 = lerp(r_y1, r_y2, z_d);
+
+            let g_x1: f32 = lerp(CLU(g_table, x, y, z), CLU(g_table, x_n, y, z), x_d);
+            let g_x2: f32 = lerp(CLU(g_table, x, y_n, z), CLU(g_table, x_n, y_n, z), x_d);
+            let g_y1: f32 = lerp(g_x1, g_x2, y_d);
+            let g_x3: f32 = lerp(CLU(g_table, x, y, z_n), CLU(g_table, x_n, y, z_n), x_d);
+            let g_x4: f32 = lerp(CLU(g_table, x, y_n, z_n), CLU(g_table, x_n, y_n, z_n), x_d);
+            let g_y2: f32 = lerp(g_x3, g_x4, y_d);
+            let clut_g: f32 = lerp(g_y1, g_y2, z_d);
+
+            let b_x1: f32 = lerp(CLU(b_table, x, y, z), CLU(b_table, x_n, y, z), x_d);
+            let b_x2: f32 = lerp(CLU(b_table, x, y_n, z), CLU(b_table, x_n, y_n, z), x_d);
+            let b_y1: f32 = lerp(b_x1, b_x2, y_d);
+            let b_x3: f32 = lerp(CLU(b_table, x, y, z_n), CLU(b_table, x_n, y, z_n), x_d);
+            let b_x4: f32 = lerp(CLU(b_table, x, y_n, z_n), CLU(b_table, x_n, y_n, z_n), x_d);
+            let b_y2: f32 = lerp(b_x3, b_x4, y_d);
+            let clut_b: f32 = lerp(b_y1, b_y2, z_d);
+            let pcs_r: f32 =
+                lut_interp_linear_float(clut_r, &self.output_clut_table[0].as_ref().unwrap());
+            let pcs_g: f32 =
+                lut_interp_linear_float(clut_g, &self.output_clut_table[1].as_ref().unwrap());
+            let pcs_b: f32 =
+                lut_interp_linear_float(clut_b, &self.output_clut_table[2].as_ref().unwrap());
+            dest[0] = clamp_float(pcs_r);
+            dest[1] = clamp_float(pcs_g);
+            dest[2] = clamp_float(pcs_b);
+        }
+    }
+}
+#[derive(Default)]
+struct Clut4x3 { // stage with four input channels and three output channels
+    input_clut_table: [Option<Vec<f32>>; 4], // one table per input channel
+    clut: Option<Vec<f32>>, // interleaved table, three f32 outputs per grid point
+    grid_size: u16, // number of grid points along each of the four input axes
+    output_clut_table: [Option<Vec<f32>>; 3], // one table per output channel
+}
+impl ModularTransform for Clut4x3 {
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        let z_stride: i32 = self.grid_size as i32;
+        let y_stride: i32 = z_stride * z_stride;
+        let x_stride: i32 = z_stride * z_stride * z_stride;
+
+        let r_tbl = &self.clut.as_ref().unwrap()[0..];
+        let g_tbl = &self.clut.as_ref().unwrap()[1..];
+        let b_tbl = &self.clut.as_ref().unwrap()[2..];
+
+        let CLU = |table: &[f32], x, y, z, w| {
+            table[((x * x_stride + y * y_stride + z * z_stride + w) * 3) as usize]
+        };
+
+        let input_clut_table_0 = self.input_clut_table[0].as_ref().unwrap();
+        let input_clut_table_1 = self.input_clut_table[1].as_ref().unwrap();
+        let input_clut_table_2 = self.input_clut_table[2].as_ref().unwrap();
+        let input_clut_table_3 = self.input_clut_table[3].as_ref().unwrap();
+        for (dest, src) in dest.chunks_exact_mut(3).zip(src.chunks_exact(4)) {
+            debug_assert!(self.grid_size as i32 >= 1);
+            let linear_x: f32 = lut_interp_linear_float(src[0], &input_clut_table_0);
+            let linear_y: f32 = lut_interp_linear_float(src[1], &input_clut_table_1);
+            let linear_z: f32 = lut_interp_linear_float(src[2], &input_clut_table_2);
+            let linear_w: f32 = lut_interp_linear_float(src[3], &input_clut_table_3);
+
+            let x: i32 = (linear_x * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let y: i32 = (linear_y * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let z: i32 = (linear_z * (self.grid_size as i32 - 1) as f32).floor() as i32;
+            let w: i32 = (linear_w * (self.grid_size as i32 - 1) as f32).floor() as i32;
+
+            let x_n: i32 = (linear_x * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let y_n: i32 = (linear_y * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let z_n: i32 = (linear_z * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+            let w_n: i32 = (linear_w * (self.grid_size as i32 - 1) as f32).ceil() as i32;
+
+            let x_d: f32 = linear_x * (self.grid_size as i32 - 1) as f32 - x as f32;
+            let y_d: f32 = linear_y * (self.grid_size as i32 - 1) as f32 - y as f32;
+            let z_d: f32 = linear_z * (self.grid_size as i32 - 1) as f32 - z as f32;
+            let w_d: f32 = linear_w * (self.grid_size as i32 - 1) as f32 - w as f32;
+
+            let quadlinear = |tbl| {
+                let CLU = |x, y, z, w| CLU(tbl, x, y, z, w);
+                let r_x1 = lerp(CLU(x, y, z, w), CLU(x_n, y, z, w), x_d);
+                let r_x2 = lerp(CLU(x, y_n, z, w), CLU(x_n, y_n, z, w), x_d);
+                let r_y1 = lerp(r_x1, r_x2, y_d);
+                let r_x3 = lerp(CLU(x, y, z_n, w), CLU(x_n, y, z_n, w), x_d);
+                let r_x4 = lerp(CLU(x, y_n, z_n, w), CLU(x_n, y_n, z_n, w), x_d);
+                let r_y2 = lerp(r_x3, r_x4, y_d);
+                let r_z1 = lerp(r_y1, r_y2, z_d);
+
+                let r_x1 = lerp(CLU(x, y, z, w_n), CLU(x_n, y, z, w_n), x_d);
+                let r_x2 = lerp(CLU(x, y_n, z, w_n), CLU(x_n, y_n, z, w_n), x_d);
+                let r_y1 = lerp(r_x1, r_x2, y_d);
+                let r_x3 = lerp(CLU(x, y, z_n, w_n), CLU(x_n, y, z_n, w_n), x_d);
+                let r_x4 = lerp(CLU(x, y_n, z_n, w_n), CLU(x_n, y_n, z_n, w_n), x_d);
+                let r_y2 = lerp(r_x3, r_x4, y_d);
+                let r_z2 = lerp(r_y1, r_y2, z_d);
+                lerp(r_z1, r_z2, w_d)
+            };
+            // TODO: instead of reading each component separately we should read all three components at once.
+            let clut_r = quadlinear(r_tbl);
+            let clut_g = quadlinear(g_tbl);
+            let clut_b = quadlinear(b_tbl);
+
+            let pcs_r =
+                lut_interp_linear_float(clut_r, &self.output_clut_table[0].as_ref().unwrap());
+            let pcs_g =
+                lut_interp_linear_float(clut_g, &self.output_clut_table[1].as_ref().unwrap());
+            let pcs_b =
+                lut_interp_linear_float(clut_b, &self.output_clut_table[2].as_ref().unwrap());
+            dest[0] = clamp_float(pcs_r);
+            dest[1] = clamp_float(pcs_g);
+            dest[2] = clamp_float(pcs_b);
+        }
+    }
+}
+/* NOT USED
+static void qcms_transform_module_tetra_clut(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+    size_t i;
+    int xy_len = 1;
+    int x_len = transform->grid_size;
+    int len = x_len * x_len;
+    float* r_table = transform->r_clut;
+    float* g_table = transform->g_clut;
+    float* b_table = transform->b_clut;
+    float c0_r, c1_r, c2_r, c3_r;
+    float c0_g, c1_g, c2_g, c3_g;
+    float c0_b, c1_b, c2_b, c3_b;
+    float clut_r, clut_g, clut_b;
+    float pcs_r, pcs_g, pcs_b;
+    for (i = 0; i < length; i++) {
+        float device_r = *src++;
+        float device_g = *src++;
+        float device_b = *src++;
+        float linear_r = lut_interp_linear_float(device_r,
+                transform->input_clut_table_r, transform->input_clut_table_length);
+        float linear_g = lut_interp_linear_float(device_g,
+                transform->input_clut_table_g, transform->input_clut_table_length);
+        float linear_b = lut_interp_linear_float(device_b,
+                transform->input_clut_table_b, transform->input_clut_table_length);
+
+        int x = floorf(linear_r * (transform->grid_size-1));
+        int y = floorf(linear_g * (transform->grid_size-1));
+        int z = floorf(linear_b * (transform->grid_size-1));
+        int x_n = ceilf(linear_r * (transform->grid_size-1));
+        int y_n = ceilf(linear_g * (transform->grid_size-1));
+        int z_n = ceilf(linear_b * (transform->grid_size-1));
+        float rx = linear_r * (transform->grid_size-1) - x;
+        float ry = linear_g * (transform->grid_size-1) - y;
+        float rz = linear_b * (transform->grid_size-1) - z;
+
+        c0_r = CLU(r_table, x, y, z);
+        c0_g = CLU(g_table, x, y, z);
+        c0_b = CLU(b_table, x, y, z);
+        if( rx >= ry ) {
+            if (ry >= rz) { //rx >= ry && ry >= rz
+                c1_r = CLU(r_table, x_n, y, z) - c0_r;
+                c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
+                c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+                c1_g = CLU(g_table, x_n, y, z) - c0_g;
+                c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
+                c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+                c1_b = CLU(b_table, x_n, y, z) - c0_b;
+                c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
+                c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+            } else {
+                if (rx >= rz) { //rx >= rz && rz >= ry
+                    c1_r = CLU(r_table, x_n, y, z) - c0_r;
+                    c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+                    c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
+                    c1_g = CLU(g_table, x_n, y, z) - c0_g;
+                    c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+                    c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
+                    c1_b = CLU(b_table, x_n, y, z) - c0_b;
+                    c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+                    c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
+                } else { //rz > rx && rx >= ry
+                    c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
+                    c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+                    c3_r = CLU(r_table, x, y, z_n) - c0_r;
+                    c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
+                    c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+                    c3_g = CLU(g_table, x, y, z_n) - c0_g;
+                    c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
+                    c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+                    c3_b = CLU(b_table, x, y, z_n) - c0_b;
+                }
+            }
+        } else {
+            if (rx >= rz) { //ry > rx && rx >= rz
+                c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
+                c2_r = CLU(r_table, x_n, y_n, z) - c0_r;
+                c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+                c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
+                c2_g = CLU(g_table, x_n, y_n, z) - c0_g;
+                c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+                c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
+                c2_b = CLU(b_table, x_n, y_n, z) - c0_b;
+                c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+            } else {
+                if (ry >= rz) { //ry >= rz && rz > rx
+                    c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+                    c2_r = CLU(r_table, x, y_n, z) - c0_r;
+                    c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
+                    c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+                    c2_g = CLU(g_table, x, y_n, z) - c0_g;
+                    c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
+                    c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+                    c2_b = CLU(b_table, x, y_n, z) - c0_b;
+                    c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
+                } else { //rz > ry && ry > rx
+                    c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+                    c2_r = CLU(r_table, x, y_n, z) - c0_r;
+                    c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+                    c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+                    c2_g = CLU(g_table, x, y_n, z) - c0_g;
+                    c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+                    c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+                    c2_b = CLU(b_table, x, y_n, z) - c0_b;
+                    c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+                }
+            }
+        }
+
+        clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
+        clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
+        clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
+
+        pcs_r = lut_interp_linear_float(clut_r,
+                transform->output_clut_table_r, transform->output_clut_table_length);
+        pcs_g = lut_interp_linear_float(clut_g,
+                transform->output_clut_table_g, transform->output_clut_table_length);
+        pcs_b = lut_interp_linear_float(clut_b,
+                transform->output_clut_table_b, transform->output_clut_table_length);
+        *dest++ = clamp_float(pcs_r);
+        *dest++ = clamp_float(pcs_g);
+        *dest++ = clamp_float(pcs_b);
+    }
+}
+*/
+/// Applies an independent 1D lookup curve to each of three channels.
+#[derive(Default)]
+struct GammaTable {
+    /// One curve per channel, indexed like the channels of a pixel.
+    input_clut_table: [Option<Vec<f32>>; 3],
+}
+impl ModularTransform for GammaTable {
+    /// Runs every 3-float pixel of `src` through the curves and stores the
+    /// clamped results in `dest`. Panics if any of the curves is `None`.
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        let curve_r = self.input_clut_table[0].as_ref().unwrap();
+        let curve_g = self.input_clut_table[1].as_ref().unwrap();
+        let curve_b = self.input_clut_table[2].as_ref().unwrap();
+
+        // Pixels are paired up; a trailing partial pixel on either side is ignored.
+        let pixels = dest.chunks_exact_mut(3).zip(src.chunks_exact(3));
+        for (out_px, in_px) in pixels {
+            let r = lut_interp_linear_float(in_px[0], curve_r);
+            let g = lut_interp_linear_float(in_px[1], curve_g);
+            let b = lut_interp_linear_float(in_px[2], curve_b);
+
+            out_px[0] = clamp_float(r);
+            out_px[1] = clamp_float(g);
+            out_px[2] = clamp_float(b);
+        }
+    }
+}
+/// Applies a `u16` output gamma lookup table to each of three channels.
+#[derive(Default)]
+struct GammaLut {
+    output_gamma_lut_r: Option<Vec<u16>>,
+    output_gamma_lut_g: Option<Vec<u16>>,
+    output_gamma_lut_b: Option<Vec<u16>>,
+}
+impl ModularTransform for GammaLut {
+    /// Interpolates every 3-float pixel of `src` through the tables and writes
+    /// the clamped values to `dest`. A missing table panics once a pixel is processed.
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        for (out_px, in_px) in dest.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
+            // The table lookup takes its input as `f64`.
+            let (in_r, in_g, in_b) = (in_px[0] as f64, in_px[1] as f64, in_px[2] as f64);
+            let r: f32 = lut_interp_linear(in_r, self.output_gamma_lut_r.as_ref().unwrap());
+            let g: f32 = lut_interp_linear(in_g, self.output_gamma_lut_g.as_ref().unwrap());
+            let b: f32 = lut_interp_linear(in_b, self.output_gamma_lut_b.as_ref().unwrap());
+
+            out_px[0] = clamp_float(r);
+            out_px[1] = clamp_float(g);
+            out_px[2] = clamp_float(b);
+        }
+    }
+}
+/// Affine colour transform: a 3x3 matrix product followed by a translation.
+#[derive(Default)]
+struct MatrixTranslate {
+    /// Linear part; `m[i][j]` weights input channel `j` in output channel `i`.
+    matrix: Matrix,
+    /// Offset added to the first output channel.
+    tx: f32,
+    /// Offset added to the second output channel.
+    ty: f32,
+    /// Offset added to the third output channel.
+    tz: f32,
+}
+impl ModularTransform for MatrixTranslate {
+    /// For every 3-float pixel `p` of `src`, stores `matrix * p + (tx, ty, tz)`,
+    /// clamped per channel, in the matching pixel of `dest`.
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        // Row `i` of the matrix produces output channel `i`.
+        let [row_r, row_g, row_b] = self.matrix.m;
+
+        for (out_px, in_px) in dest.chunks_exact_mut(3).zip(src.chunks_exact(3)) {
+            let (r, g, b) = (in_px[0], in_px[1], in_px[2]);
+
+            // Products are summed left to right with the offset added last, so
+            // the floating-point rounding sequence stays fixed.
+            let out_r = row_r[0] * r + row_r[1] * g + row_r[2] * b + self.tx;
+            let out_g = row_g[0] * r + row_g[1] * g + row_g[2] * b + self.ty;
+            let out_b = row_b[0] * r + row_b[1] * g + row_b[2] * b + self.tz;
+
+            out_px[0] = clamp_float(out_r);
+            out_px[1] = clamp_float(out_g);
+            out_px[2] = clamp_float(out_b);
+        }
+    }
+}
+/// Linear colour transform: multiplies every 3-float pixel by a 3x3 matrix
+/// and clamps the result.
+#[derive(Default)]
+struct MatrixTransform {
+    /// `m[i][j]` weights input channel `j` in output channel `i`.
+    matrix: Matrix,
+}
+impl ModularTransform for MatrixTransform {
+    /// For every 3-float pixel `p` of `src`, stores `matrix * p`, clamped per
+    /// channel, in the matching pixel of `dest`.
+    fn transform(&self, src: &[f32], dest: &mut [f32]) {
+        // Row `i` of the matrix produces output channel `i`.
+        let [row_r, row_g, row_b] = self.matrix.m;
+
+        // Pixels are paired up; a trailing partial pixel on either side is ignored.
+        let pixels = dest.chunks_exact_mut(3).zip(src.chunks_exact(3));
+        for (out_px, in_px) in pixels {
+            let (r, g, b) = (in_px[0], in_px[1], in_px[2]);
+
+            // Products are summed left to right so the floating-point rounding
+            // sequence stays fixed.
+            let out_r = row_r[0] * r + row_r[1] * g + row_r[2] * b;
+            let out_g = row_g[0] * r + row_g[1] * g + row_g[2] * b;
+            let out_b = row_b[0] * r + row_b[1] * g + row_b[2] * b;
+
+            out_px[0] = clamp_float(out_r);
+            out_px[1] = clamp_float(out_g);
+            out_px[2] = clamp_float(out_b);
+        }
+    }
+}
+
+/// Builds the stage list for a `lutmABType` tag: A curves and CLUT (when A
+/// curves exist), M curves and matrix (when M curves exist), then the
+/// mandatory B curves. The list is reversed when `lut.reversed` is set.
+/// Returns `None` when A curves come without a CLUT, when the CLUT grid is
+/// not cubic, or when B curves are absent.
+fn modular_transform_create_mAB(lut: &lutmABType) -> Option<Vec<Box<dyn ModularTransform>>> {
+    // Expands one optional curve into a lookup table owned by a `Vec`.
+    let table = |curve| build_input_gamma_table(curve).map(|t| (t as Box<[f32]>).into_vec());
+    let mut stages: Vec<Box<dyn ModularTransform>> = Vec::new();
+
+    if lut.a_curves[0].is_some() {
+        // If the A curve is present this also implies the
+        // presence of a CLUT.
+        let clut = lut.clut_table.as_ref()?;
+        // Prepare A curve.
+        let mut a_stage = Box::new(GammaTable::default());
+        a_stage.input_clut_table = [
+            table(lut.a_curves[0].as_deref()),
+            table(lut.a_curves[1].as_deref()),
+            table(lut.a_curves[2].as_deref()),
+        ];
+
+        let points = &lut.num_grid_points;
+        if points[0] as i32 != points[1] as i32 || points[1] as i32 != points[2] as i32 {
+            //XXX: We don't currently support clut that are not squared!
+            return None;
+        }
+        stages.push(a_stage);
+
+        // Prepare CLUT
+        let expected_len = (points[0] as usize).pow(3) * 3;
+        assert_eq!(expected_len, clut.len());
+        let mut clut_stage = Box::new(ClutOnly::default());
+        clut_stage.clut = lut.clut_table.clone();
+        clut_stage.grid_size = points[0] as u16;
+        stages.push(clut_stage);
+    }
+
+    if lut.m_curves[0].is_some() {
+        // M curve implies the presence of a Matrix
+        // Prepare M curve
+        let mut m_stage = Box::new(GammaTable::default());
+        m_stage.input_clut_table = [
+            table(lut.m_curves[0].as_deref()),
+            table(lut.m_curves[1].as_deref()),
+            table(lut.m_curves[2].as_deref()),
+        ];
+        stages.push(m_stage);
+
+        // Prepare Matrix
+        stages.push(Box::new(MatrixTranslate {
+            matrix: build_mAB_matrix(lut),
+            tx: s15Fixed16Number_to_float(lut.e03),
+            ty: s15Fixed16Number_to_float(lut.e13),
+            tz: s15Fixed16Number_to_float(lut.e23),
+        }));
+    }
+
+    // B curve is mandatory
+    if lut.b_curves[0].is_none() {
+        return None;
+    }
+    // Prepare B curve
+    let mut b_stage = Box::new(GammaTable::default());
+    b_stage.input_clut_table = [
+        table(lut.b_curves[0].as_deref()),
+        table(lut.b_curves[1].as_deref()),
+        table(lut.b_curves[2].as_deref()),
+    ];
+    stages.push(b_stage);
+
+    if lut.reversed {
+        // mBA are identical to mAB except that the transformation order
+        // is reversed
+        stages.reverse();
+    }
+    Some(stages)
+}
+
+/// Builds the chain for a 3-channel `lutType` tag: the tag's matrix followed by
+/// a `Clut3x3` holding its input curves, CLUT and output curves.
+///
+/// Always returns `Some`. Panics if the CLUT does not hold exactly
+/// `num_clut_grid_points^3 * 3` values, or if `input_table` / `output_table`
+/// is shorter than three curves of the advertised length.
+fn modular_transform_create_lut(lut: &lutType) -> Option<Vec<Box<dyn ModularTransform>>> {
+    // Copies curve number `index` out of a table of back-to-back curves.
+    let curve = |table: &[f32], entries: usize, index: usize| {
+        Some(table[entries * index..entries * (index + 1)].to_vec())
+    };
+
+    // Entry counts give the length of one curve inside each table.
+    let in_entries = lut.num_input_table_entries as usize;
+    let out_entries = lut.num_output_table_entries as usize;
+    let grid_points = lut.num_clut_grid_points;
+
+    let mut stages: Vec<Box<dyn ModularTransform>> = Vec::new();
+
+    // First stage: the matrix carried by the tag.
+    stages.push(Box::new(MatrixTransform {
+        matrix: build_lut_matrix(lut),
+    }));
+
+    // Second stage: curves and CLUT bundled into one interpolating lookup.
+    let mut clut_stage = Box::new(Clut3x3::default());
+
+    // Prepare input curves
+    clut_stage.input_clut_table[0] = curve(&lut.input_table, in_entries, 0);
+    clut_stage.input_clut_table[1] = curve(&lut.input_table, in_entries, 1);
+    clut_stage.input_clut_table[2] = curve(&lut.input_table, in_entries, 2);
+
+    // Prepare table
+    // Three output values are expected for each node of the cubic grid.
+    let expected_len = (grid_points as usize).pow(3) * 3;
+    assert_eq!(expected_len, lut.clut_table.len());
+    clut_stage.clut = Some(lut.clut_table.clone());
+
+    clut_stage.grid_size = grid_points as u16;
+
+    // Prepare output curves
+    clut_stage.output_clut_table[0] = curve(&lut.output_table, out_entries, 0);
+    clut_stage.output_clut_table[1] = curve(&lut.output_table, out_entries, 1);
+    clut_stage.output_clut_table[2] = curve(&lut.output_table, out_entries, 2);
+
+    stages.push(clut_stage);
+    Some(stages)
+}
+
+/// Builds the single-stage chain for a 4-input `lutType` tag: a `Clut4x3`
+/// holding the tag's input curves, CLUT and output curves.
+///
+/// Panics if the CLUT does not hold exactly
+/// `num_clut_grid_points^num_input_channels * num_output_channels` values, or
+/// if `input_table` / `output_table` is too short for its curves.
+fn modular_transform_create_lut4x3(lut: &lutType) -> Vec<Box<dyn ModularTransform>> {
+    // Copies curve number `index` out of a table of back-to-back curves.
+    let curve = |table: &[f32], entries: usize, index: usize| {
+        Some(table[entries * index..entries * (index + 1)].to_vec())
+    };
+
+    // Entry counts give the length of one curve inside each table.
+    let in_entries = lut.num_input_table_entries as usize;
+    let out_entries = lut.num_output_table_entries as usize;
+    let grid_points = lut.num_clut_grid_points;
+
+    // the matrix of lutType is only used when the input color space is XYZ.
+    let mut stage = Box::new(Clut4x3::default());
+
+    // Prepare input curves
+    stage.input_clut_table = [
+        curve(&lut.input_table, in_entries, 0),
+        curve(&lut.input_table, in_entries, 1),
+        curve(&lut.input_table, in_entries, 2),
+        curve(&lut.input_table, in_entries, 3),
+    ];
+
+    // Prepare table
+    let expected_len = (grid_points as usize).pow(lut.num_input_channels as u32)
+        * lut.num_output_channels as usize;
+    assert_eq!(expected_len, lut.clut_table.len());
+    stage.clut = Some(lut.clut_table.clone());
+
+    stage.grid_size = grid_points as u16;
+
+    // Prepare output curves
+    stage.output_clut_table = [
+        curve(&lut.output_table, out_entries, 0),
+        curve(&lut.output_table, out_entries, 1),
+        curve(&lut.output_table, out_entries, 2),
+    ];
+
+    // A single stage covers the curves and the CLUT.
+    let mut stages: Vec<Box<dyn ModularTransform>> = Vec::new();
+    stages.push(stage);
+    stages
+}
+
+/// Builds the device-to-PCS stages for `input`.
+///
+/// Sources are tried in order: the `A2B0` LUT (4-input or 3-input flavour),
+/// an `mAB` tag with three input and three output channels, and finally the
+/// three TRC curves combined with the colorant matrix. Returns `None` when
+/// the selected source cannot be turned into stages.
+fn modular_transform_create_input(input: &Profile) -> Option<Vec<Box<dyn ModularTransform>>> {
+    let mut stages: Vec<Box<dyn ModularTransform>> = Vec::new();
+
+    // A2B0, when present, wins over every other description.
+    if let Some(a2b0) = &input.A2B0 {
+        let lut_stages = if a2b0.num_input_channels == 4 {
+            modular_transform_create_lut4x3(a2b0)
+        } else {
+            modular_transform_create_lut(a2b0)?
+        };
+        stages.extend(lut_stages);
+        return Some(stages);
+    }
+
+    // Otherwise use mAB when it is a 3-in / 3-out tag.
+    match input.mAB.as_deref() {
+        Some(mab) if mab.num_in_channels == 3 && mab.num_out_channels == 3 => {
+            stages.extend(modular_transform_create_mAB(mab)?);
+        }
+        _ => {
+            // Fall back to the per-channel tone curves; all three are required.
+            let table =
+                |trc| build_input_gamma_table(trc).map(|t| (t as Box<[f32]>).into_vec());
+            let curves = [
+                table(input.redTRC.as_deref()),
+                table(input.greenTRC.as_deref()),
+                table(input.blueTRC.as_deref()),
+            ];
+            if curves.iter().any(Option::is_none) {
+                return None;
+            }
+            let mut gamma = Box::new(GammaTable::default());
+            gamma.input_clut_table = curves;
+            stages.push(gamma);
+
+            // Scale every channel by 1 / 1.999_969_5.
+            let scale = 1. / 1.999_969_5;
+            let mut scaling = Box::new(MatrixTransform::default());
+            scaling.matrix.m = [
+                [scale, 0.0, 0.0],
+                [0.0, scale, 0.0],
+                [0.0, 0.0, scale],
+            ];
+            stages.push(scaling);
+
+            // Multiply by the colorant matrix of the profile.
+            let mut colorants = Box::new(MatrixTransform::default());
+            colorants.matrix = build_colorant_matrix(input);
+            stages.push(colorants);
+        }
+    }
+    Some(stages)
+}
+/// Builds the PCS-to-device stages for `out`.
+///
+/// Sources are tried in order: the `B2A0` LUT (which must be 3-in / 3-out),
+/// an `mBA` tag with three input and three output channels, and finally the
+/// inverted colorant matrix followed by the three TRC output tables.
+/// Returns `None` when the selected source cannot be turned into stages or
+/// when no source is available.
+fn modular_transform_create_output(out: &Profile) -> Option<Vec<Box<dyn ModularTransform>>> {
+    // B2A0, when present, wins over every other description.
+    if let Some(b2a0) = &out.B2A0 {
+        if b2a0.num_input_channels != 3 || b2a0.num_output_channels != 3 {
+            return None;
+        }
+        return modular_transform_create_lut(b2a0);
+    }
+
+    // Otherwise prefer a 3-in / 3-out mBA tag, then the TRC curves.
+    let mut stages: Vec<Box<dyn ModularTransform>> = Vec::new();
+    match (out.mBA.as_deref(), &out.redTRC, &out.greenTRC, &out.blueTRC) {
+        (Some(mba), ..) if mba.num_in_channels == 3 && mba.num_out_channels == 3 => {
+            stages.extend(modular_transform_create_mAB(mba)?);
+        }
+        (_, Some(red), Some(green), Some(blue)) => {
+            // Multiply by the inverse of the profile's colorant matrix.
+            let mut to_device = Box::new(MatrixTransform::default());
+            to_device.matrix = build_colorant_matrix(out).invert()?;
+            stages.push(to_device);
+
+            // Scale every channel by 1.999_969_5.
+            let scale = 1.999_969_5;
+            let mut scaling = Box::new(MatrixTransform::default());
+            scaling.matrix.m = [
+                [scale, 0.0, 0.0],
+                [0.0, scale, 0.0],
+                [0.0, 0.0, scale],
+            ];
+            stages.push(scaling);
+
+            // Per-channel lookup tables built from the three TRC curves.
+            let mut gamma = Box::new(GammaLut::default());
+            gamma.output_gamma_lut_r = Some(build_output_lut(red)?);
+            gamma.output_gamma_lut_g = Some(build_output_lut(green)?);
+            gamma.output_gamma_lut_b = Some(build_output_lut(blue)?);
+            stages.push(gamma);
+        }
+        _ => {
+            debug_assert!(false, "Unsupported output profile workflow.");
+            return None;
+        }
+    }
+    Some(stages)
+}
+/* Not Completed
+// Simplify the transformation chain to an equivalent transformation chain
+static struct qcms_modular_transform* qcms_modular_transform_reduce(struct qcms_modular_transform *transform)
+{
+    struct qcms_modular_transform *first_transform = NULL;
+    struct qcms_modular_transform *curr_trans = transform;
+    struct qcms_modular_transform *prev_trans = NULL;
+    while (curr_trans) {
+        struct qcms_modular_transform *next_trans = curr_trans->next_transform;
+        if (curr_trans->transform_module_fn == qcms_transform_module_matrix) {
+            if (next_trans && next_trans->transform_module_fn == qcms_transform_module_matrix) {
+                curr_trans->matrix = matrix_multiply(curr_trans->matrix, next_trans->matrix);
+                goto remove_next;
+            }
+        }
+        if (curr_trans->transform_module_fn == qcms_transform_module_gamma_table) {
+            bool isLinear = true;
+            uint16_t i;
+            for (i = 0; isLinear && i < 256; i++) {
+                isLinear &= (int)(curr_trans->input_clut_table_r[i] * 255) == i;
+                isLinear &= (int)(curr_trans->input_clut_table_g[i] * 255) == i;
+                isLinear &= (int)(curr_trans->input_clut_table_b[i] * 255) == i;
+            }
+            goto remove_current;
+        }
+
+next_transform:
+        if (!next_trans) break;
+        prev_trans = curr_trans;
+        curr_trans = next_trans;
+        continue;
+remove_current:
+        if (curr_trans == transform) {
+            //Update head
+            transform = next_trans;
+        } else {
+            prev_trans->next_transform = next_trans;
+        }
+        curr_trans->next_transform = NULL;
+        qcms_modular_transform_release(curr_trans);
+        //return transform;
+        return qcms_modular_transform_reduce(transform);
+remove_next:
+        curr_trans->next_transform = next_trans->next_transform;
+        next_trans->next_transform = NULL;
+        qcms_modular_transform_release(next_trans);
+        continue;
+    }
+    return transform;
+}
+*/
+/// Builds the complete list of stages converting `input` device colours to
+/// `output` device colours: input stages, an optional Lab/XYZ conversion
+/// between the two profile connection spaces, then output stages.
+///
+/// Returns `None` when either profile's colour space is unsupported (after a
+/// `debug_assert!` in debug builds) or when a stage list cannot be built.
+fn modular_transform_create(
+    input: &Profile,
+    output: &Profile,
+) -> Option<Vec<Box<dyn ModularTransform>>> {
+    let mut transforms = Vec::new();
+    // Device colours -> PCS.
+    if input.color_space == RGB_SIGNATURE || input.color_space == CMYK_SIGNATURE {
+        transforms.extend(modular_transform_create_input(input)?);
+    } else {
+        debug_assert!(false, "input color space not supported");
+        return None;
+    }
+
+    // Bridge the two connection spaces when they differ.
+    if input.pcs == LAB_SIGNATURE && output.pcs == XYZ_SIGNATURE {
+        transforms.push(Box::new(LABtoXYZ {}));
+    }
+
+    // This does not improve accuracy in practice, something is wrong here.
+    //if (in->chromaticAdaption.invalid == false) {
+    //	struct qcms_modular_transform* chromaticAdaption;
+    //	chromaticAdaption = qcms_modular_transform_alloc();
+    //	if (!chromaticAdaption)
+    //		goto fail;
+    //	append_transform(chromaticAdaption, &next_transform);
+    //	chromaticAdaption->matrix = matrix_invert(in->chromaticAdaption);
+    //	chromaticAdaption->transform_module_fn = qcms_transform_module_matrix;
+    //}
+
+    if input.pcs == XYZ_SIGNATURE && output.pcs == LAB_SIGNATURE {
+        transforms.push(Box::new(XYZtoLAB {}));
+    }
+
+    // PCS -> device colours.
+    if output.color_space == RGB_SIGNATURE || output.color_space == CMYK_SIGNATURE {
+        transforms.extend(modular_transform_create_output(output)?);
+    } else {
+        // Without an output stage the chain would hand PCS values back as if
+        // they were device values, so refuse to build it, exactly as is done
+        // for an unsupported input colour space.
+        debug_assert!(false, "output color space not supported");
+        return None;
+    }
+
+    // Not Completed
+    //return qcms_modular_transform_reduce(first_transform);
+    Some(transforms)
+}
+/// Runs `src` through every stage in order, using `dest` as scratch; returns the result.
+fn modular_transform_data(
+    transforms: Vec<Box<dyn ModularTransform>>,
+    src: Vec<f32>,
+    dest: Vec<f32>,
+    _len: usize,
+) -> Vec<f32> {
+    // Ping-pong: the buffer a stage just wrote becomes the next stage's input.
+    let buffers = transforms.into_iter().fold((src, dest), |(input, mut output), stage| {
+        stage.transform(&input, &mut output);
+        (output, input)
+    });
+    buffers.0
+}
+
+/// Builds the stage chain from `input` to `output` and runs `src` through it
+/// with `dest` as scratch space.
+/// Returns `None` when the chain cannot be built.
+pub fn chain_transform(
+    input: &Profile,
+    output: &Profile,
+    src: Vec<f32>,
+    dest: Vec<f32>,
+    lutSize: usize,
+) -> Option<Vec<f32>> {
+    let stages = modular_transform_create(input, output)?;
+    // A third of `lutSize` is handed on as the length argument.
+    Some(modular_transform_data(stages, src, dest, lutSize / 3))
+}
diff --git a/gfx/qcms/src/gtest.rs b/gfx/qcms/src/gtest.rs
new file mode 100644
index 0000000000..bfe350def4
--- /dev/null
+++ b/gfx/qcms/src/gtest.rs
@@ -0,0 +1,962 @@
+#[cfg(all(test, feature = "c_bindings"))]
+#[allow(clippy::all)]
+mod gtest {
+    use crate::{
+        c_bindings::*, iccread::*, transform::DataType::*, transform::*,
+        transform_util::lut_inverse_interp16, Intent::Perceptual,
+    };
+    use libc::c_void;
+    #[cfg(target_arch = "arm")]
+    use std::arch::is_arm_feature_detected;
+    #[cfg(target_arch = "aarch64")]
+    use std::arch::is_aarch64_feature_detected;
+    use std::ptr::null_mut;
+
+    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+    use crate::transform_neon::{
+        qcms_transform_data_bgra_out_lut_neon, qcms_transform_data_rgb_out_lut_neon,
+        qcms_transform_data_rgba_out_lut_neon,
+    };
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    use crate::{
+        transform_avx::{
+            qcms_transform_data_bgra_out_lut_avx, qcms_transform_data_rgb_out_lut_avx,
+            qcms_transform_data_rgba_out_lut_avx,
+        },
+        transform_sse2::{
+            qcms_transform_data_bgra_out_lut_sse2, qcms_transform_data_rgb_out_lut_sse2,
+            qcms_transform_data_rgba_out_lut_sse2,
+        },
+    };
+
+    #[test]
+    fn test_lut_inverse_crash() {
+        let lutTable1: [u16; 128] = [
+            0x0000, 0x0000, 0x0000, 0x8000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        ];
+        let lutTable2: [u16; 128] = [
+            0xFFF0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+            0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        ];
+
+        // Crash/assert test: the results are discarded, the lookups only have to
+        // complete without panicking on these almost entirely saturated tables.
+        lut_inverse_interp16(5, &lutTable1);
+        lut_inverse_interp16(5, &lutTable2);
+    }
+
+    #[test]
+    fn test_lut_inverse() {
+        // Imitates the sRGB_v4_ICC mBA output curve: flat at zero, a linear
+        // ramp, then flat at the maximum.
+        //
+        //       XXXX
+        //      X
+        //     X
+        // XXXX
+        const RAMP_START: usize = 20;
+        const RAMP_END: usize = 200;
+        let mut lutTable: [u16; 256] = [0; 256];
+
+        for (index, entry) in lutTable.iter_mut().enumerate() {
+            *entry = if index < RAMP_START {
+                0
+            } else if index < RAMP_END {
+                ((index - RAMP_START) * 0xFFFF / (RAMP_END - RAMP_START)) as u16
+            } else {
+                0xFFFF
+            };
+        }
+
+        // Sweep the inputs; the results of this pass are ignored.
+        for input in 0..65535 {
+            lut_inverse_interp16(input, &lutTable);
+        }
+
+        // Look up the interesting points.
+        let at_zero: u16 = lut_inverse_interp16(0, &lutTable);
+        assert!(at_zero <= 20 * 256);
+
+        let just_above_zero: u16 = lut_inverse_interp16(1, &lutTable);
+        assert!(just_above_zero > 20 * 256);
+
+        let at_max: u16 = lut_inverse_interp16(65535, &lutTable);
+        assert!(at_max < 201 * 256);
+    }
+
+    // This test takes too long to run under Miri.
+    #[cfg(not(miri))]
+    #[test]
+    fn test_lut_inverse_non_monotonic() {
+        // Make sure we behave sanely for non-monotonic functions: the table
+        // consists of three ramps that each start over from zero.
+        //
+        //   X  X  X
+        //  X  X  X
+        // X  X  X
+        let mut lutTable: [u16; 256] = [0; 256];
+
+        // Each (start, end) index range holds one ramp; the last range is
+        // shorter than the other two, so its ramp is steeper.
+        let ramps: [(usize, usize); 3] = [(0, 100), (100, 200), (200, 256)];
+        for &(start, end) in ramps.iter() {
+            for index in start..end {
+                lutTable[index] = ((index - start) * 0xFFFF / (end - start)) as u16;
+            }
+        }
+
+        // The results are ignored: the point is that no lookup crashes or
+        // hangs on such a table, and that sanitizers get a chance to do
+        // their magic.
+        for input in 0..65535 {
+            lut_inverse_interp16(input, &lutTable);
+        }
+    }
+    /* qcms_data_create_rgb_with_gamma is broken
+    #[test]
+    fn profile_from_gamma() {
+
+        let white_point = qcms_CIE_xyY { x: 0.64, y: 0.33, Y: 1.};
+        let primaries = qcms_CIE_xyYTRIPLE {
+            red: qcms_CIE_xyY { x: 0.64, y: 0.33, Y: 1.},
+            green: qcms_CIE_xyY { x: 0.21, y: 0.71, Y: 1.},
+            blue: qcms_CIE_xyY { x: 0.15, y: 0.06, Y: 1.}
+        };
+        let mut mem: *mut libc::c_void = std::ptr::null_mut();
+        let mut size: size_t = 0;
+        unsafe { qcms_data_create_rgb_with_gamma(white_point, primaries, 2.2, &mut mem, &mut size); }
+        assert!(size != 0)
+    }
+    */
+
+    #[test]
+    fn alignment() {
+        assert_eq!(std::mem::align_of::<qcms_transform>(), 16); // exactly 16-byte alignment
+    }
+
+    #[test]
+    fn basic() {
+        // Smoke test: run an sRGB -> gamma 2.2 transform in place; null handles fail the test.
+        let sRGB_profile = crate::c_bindings::qcms_profile_sRGB();
+        let Rec709Primaries = qcms_CIE_xyYTRIPLE {
+            red: qcms_CIE_xyY {
+                x: 0.6400f64,
+                y: 0.3300f64,
+                Y: 1.0f64,
+            },
+            green: qcms_CIE_xyY {
+                x: 0.3000f64,
+                y: 0.6000f64,
+                Y: 1.0f64,
+            },
+            blue: qcms_CIE_xyY {
+                x: 0.1500f64,
+                y: 0.0600f64,
+                Y: 1.0f64,
+            },
+        };
+        let D65 = qcms_white_point_sRGB();
+        let other = unsafe { qcms_profile_create_rgb_with_gamma(D65, Rec709Primaries, 2.2) };
+        assert!(!other.is_null());
+        unsafe { qcms_profile_precache_output_transform(&mut *other) };
+        let transform = unsafe {
+            qcms_transform_create(&mut *sRGB_profile, RGB8, &mut *other, RGB8, Perceptual)
+        };
+        assert!(!transform.is_null());
+        let mut data: [u8; 120] = [0; 120];
+        unsafe {
+            qcms_transform_data(
+                &*transform,
+                data.as_ptr() as *const libc::c_void,
+                data.as_mut_ptr() as *mut libc::c_void,
+                data.len() / 3,
+            )
+        };
+
+        unsafe {
+            qcms_transform_release(transform);
+            qcms_profile_release(sRGB_profile);
+            qcms_profile_release(other);
+        }
+    }
+
+    #[test]
+    fn gray_alpha() {
+        let sRGB_profile = qcms_profile_sRGB();
+        let other = unsafe { qcms_profile_create_gray_with_gamma(2.2) };
+        unsafe { qcms_profile_precache_output_transform(&mut *other) };
+        // The gray profile is the input side of the transform and sRGB the output side.
+        let transform = unsafe {
+            qcms_transform_create(&mut *other, GrayA8, &mut *sRGB_profile, RGBA8, Perceptual)
+        };
+        assert!(!transform.is_null());
+        // Two gray+alpha pixels: (gray 0, alpha 255) and (gray 255, alpha 0).
+        let in_data: [u8; 4] = [0, 255, 255, 0];
+        let mut out_data: [u8; 2 * 4] = [0; 8];
+        unsafe {
+            qcms_transform_data(
+                &*transform,
+                in_data.as_ptr() as *const libc::c_void,
+                out_data.as_mut_ptr() as *mut libc::c_void,
+                in_data.len() / 2,
+            )
+        };
+        // Expected for these extremes: the gray level in R, G and B, alpha carried over.
+        assert_eq!(out_data, [0, 0, 0, 255, 255, 255, 255, 0]);
+        unsafe {
+            qcms_transform_release(transform);
+            qcms_profile_release(sRGB_profile);
+            qcms_profile_release(other);
+        }
+    }
+    #[test]
+    fn samples() {
+        // Every listed fuzz sample must parse into a profile after
+        // `qcms_enable_iccv4` has been called.
+        use libc::c_void;
+
+        qcms_enable_iccv4();
+        // The samples live in <crate root>/fuzz/samples.
+        let samples_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("fuzz")
+            .join("samples");
+        let samples = [
+            "0220-ca351238d719fd07ef8607d326b398fe.icc",
+            "0372-973178997787ee780b4b58ee47cad683.icc",
+            "0744-0a5faafe175e682b10c590b03d3f093b.icc",
+            "0316-eb3f97ab646cd7b66bee80bdfe6098ac.icc",
+            "0732-80707d91aea0f8e64ef0286cc7720e99.icc",
+            "1809-2bd4b77651214ca6110fdbee2502671e.icc",
+        ];
+
+        for name in samples.iter() {
+            // Read the whole file into memory and hand the bytes to the parser.
+            let icc = std::fs::read(samples_dir.join(name)).unwrap();
+            let profile =
+                unsafe { qcms_profile_from_memory(icc.as_ptr() as *const c_void, icc.len()) };
+            assert_ne!(profile, std::ptr::null_mut());
+            unsafe { qcms_profile_release(profile) };
+        }
+    }
+
+    #[test]
+    fn v4() {
+        use libc::c_void;
+        use std::io::Read;
+
+        let mut p = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+        qcms_enable_iccv4();
+        p.push("profiles");
+        // This profile was made by taking the lookup table profile from
+        // http://displaycal.net/icc-color-management-test/ and removing
+        // the unneeded tables using lcms.
+        p.push("displaycal-lut-stripped.icc");
+
+        let mut file = std::fs::File::open(p).unwrap();
+        let mut data = Vec::new();
+        file.read_to_end(&mut data).unwrap();
+        let profile =
+            unsafe { qcms_profile_from_memory(data.as_ptr() as *const c_void, data.len()) };
+        assert_ne!(profile, std::ptr::null_mut());
+
+        let srgb_profile = qcms_profile_sRGB();
+        assert_ne!(srgb_profile, std::ptr::null_mut());
+
+        unsafe { qcms_profile_precache_output_transform(&mut *srgb_profile) };
+        // Use the intent that `qcms_profile_get_rendering_intent` reports for the loaded profile.
+        let intent = unsafe { qcms_profile_get_rendering_intent(&*profile) };
+        let transform =
+            unsafe { qcms_transform_create(&*profile, RGB8, &*srgb_profile, RGB8, intent) };
+
+        assert_ne!(transform, std::ptr::null_mut());
+
+        const SRC_SIZE: usize = 4;
+        let src: [u8; SRC_SIZE * 3] = [
+            246, 246, 246, // gray
+            255, 0, 0, // red
+            0, 255, 255, // cyan
+            255, 255, 0, // yellow
+        ];
+        let mut dst: [u8; SRC_SIZE * 3] = [0; SRC_SIZE * 3];
+
+        // The reference values here should be adjusted if the accuracy
+        // of the transformation changes. The comments name the output color.
+        let reference = [
+            246, 246, 246, // gray (from gray)
+            255, 0, 0, // red (from red)
+            248, 14, 22, // red (from cyan)
+            0, 0, 255, // blue (from yellow)
+        ];
+
+        unsafe {
+            qcms_transform_data(
+                &*transform,
+                src.as_ptr() as *const libc::c_void,
+                dst.as_mut_ptr() as *mut libc::c_void,
+                SRC_SIZE,
+            );
+        }
+
+        assert_eq!(reference, dst);
+        unsafe { qcms_transform_release(transform) }
+        unsafe { qcms_profile_release(profile) }
+        unsafe { qcms_profile_release(srgb_profile) }
+    }
+
+    fn CmpRgbChannel(reference: &[u8], test: &[u8], index: usize) -> bool {
+        (i32::from(reference[index]) - i32::from(test[index])).abs() <= 1 // tolerance: one step
+    }
+
+    /// Compares the first `pixels` pixels of two buffers, tolerating a
+    /// difference of one per color channel; alpha has to match exactly.
+    ///
+    /// Panics through `assert_eq!` on the first pixel outside the tolerance.
+    fn CmpRgbBufferImpl(
+        refBuffer: &[u8],
+        testBuffer: &[u8],
+        pixels: usize,
+        kSwapRB: bool,
+        hasAlpha: bool,
+    ) -> bool {
+        let stride = if hasAlpha { 4 } else { 3 };
+        let expected = &refBuffer[..pixels * stride];
+        let actual = &testBuffer[..pixels * stride];
+        if expected == actual {
+            return true;
+        }
+
+        // Byte offsets of the channels inside one pixel.
+        let (red, green, blue, alpha) = if kSwapRB { (2, 1, 0, 3) } else { (0, 1, 2, 3) };
+
+        for (want, got) in expected.chunks_exact(stride).zip(actual.chunks_exact(stride)) {
+            let colors_close = [red, green, blue]
+                .iter()
+                .all(|&channel| CmpRgbChannel(want, got, channel));
+            if colors_close && (!hasAlpha || want[alpha] == got[alpha]) {
+                continue;
+            }
+            // Report the exact mismatch: at least one of the channels checked
+            // below differs, so one of these assertions fails.
+            assert_eq!(got[red], want[red]);
+            assert_eq!(got[green], want[green]);
+            assert_eq!(got[blue], want[blue]);
+            if hasAlpha {
+                assert_eq!(got[alpha], want[alpha]);
+            }
+            return false;
+        }
+
+        true
+    }
+
+    /// Builds the input pixels used by the transform tests.
+    ///
+    /// The buffer has three equally sized sections; each section sweeps one of
+    /// red, green and blue over 0..=255 while the other two channels take every
+    /// combination of a small set of sample values. Returns the pixel count
+    /// together with the buffer. `kSwapRB` stores blue in byte 0 and red in
+    /// byte 2; `kHasAlpha` adds a fourth byte of 0x80 to every pixel.
+    fn GetRgbInputBufferImpl(kSwapRB: bool, kHasAlpha: bool) -> (usize, Vec<u8>) {
+        let colorSamples = [0, 5, 16, 43, 101, 127, 182, 255];
+        let colorSampleMax = colorSamples.len();
+        let pixelSize = if kHasAlpha { 4 } else { 3 };
+        let pixelCount = colorSampleMax * colorSampleMax * 256 * 3;
+
+        let mut outBuffer = vec![0; pixelCount * pixelSize];
+
+        let kRIndex = if kSwapRB { 2 } else { 0 };
+        let kGIndex = 1;
+        let kBIndex = if kSwapRB { 0 } else { 2 };
+        let kAIndex = 3;
+
+        // One cursor is shared by all three passes, so each pass fills its own
+        // third of the buffer. Restarting at the front for every pass would
+        // overwrite the first third and leave the rest zeroed.
+        let mut pixel_slots = outBuffer.chunks_exact_mut(pixelSize);
+        let mut put = |r: u8, g: u8, b: u8| {
+            let color = pixel_slots.next().expect("buffer holds pixelCount pixels");
+            color[kRIndex] = r;
+            color[kGIndex] = g;
+            color[kBIndex] = b;
+            if kHasAlpha {
+                color[kAIndex] = 0x80;
+            }
+        };
+
+        // Sample every red pixel value with a subset of green and blue.
+        // we use a u16 for r to avoid https://github.com/rust-lang/rust/issues/78283
+        for r in 0..=255u16 {
+            for &g in colorSamples.iter() {
+                for &b in colorSamples.iter() {
+                    put(r as u8, g, b);
+                }
+            }
+        }
+
+        // Sample every green pixel value with a subset of red and blue.
+        for &r in colorSamples.iter() {
+            for g in 0..=255u16 {
+                for &b in colorSamples.iter() {
+                    put(r, g as u8, b);
+                }
+            }
+        }
+
+        // Sample every blue pixel value with a subset of red and green.
+        for &r in colorSamples.iter() {
+            for &g in colorSamples.iter() {
+                for b in 0..=255u16 {
+                    put(r, g, b as u8);
+                }
+            }
+        }
+
+        (pixelCount, outBuffer)
+    }
+
+    fn GetRgbInputBuffer() -> (usize, Vec<u8>) {
+        GetRgbInputBufferImpl(false, false) // R and B not swapped, no alpha byte
+    }
+
+    fn GetRgbaInputBuffer() -> (usize, Vec<u8>) {
+        GetRgbInputBufferImpl(false, true) // R and B not swapped, with alpha byte
+    }
+
+    fn GetBgraInputBuffer() -> (usize, Vec<u8>) {
+        GetRgbInputBufferImpl(true, true) // R and B swapped, with alpha byte
+    }
+
+    fn CmpRgbBuffer(refBuffer: &[u8], testBuffer: &[u8], pixels: usize) -> bool {
+        CmpRgbBufferImpl(refBuffer, testBuffer, pixels, false, false) // 3-byte pixels, no swap
+    }
+
+    fn CmpRgbaBuffer(refBuffer: &[u8], testBuffer: &[u8], pixels: usize) -> bool {
+        CmpRgbBufferImpl(refBuffer, testBuffer, pixels, false, true) // 4-byte pixels, no swap
+    }
+
+    fn CmpBgraBuffer(refBuffer: &[u8], testBuffer: &[u8], pixels: usize) -> bool {
+        CmpRgbBufferImpl(refBuffer, testBuffer, pixels, true, true) // 4-byte pixels, R/B swapped
+    }
+
+    /// Zeroes the first `pixels` three-byte pixels of `buffer`.
+    fn ClearRgbBuffer(buffer: &mut [u8], pixels: usize) {
+        let byte_count = pixels * 3;
+        buffer[..byte_count].iter_mut().for_each(|byte| *byte = 0);
+    }
+
+    /// Zeroes the first `pixels` four-byte pixels of `buffer`.
+    fn ClearRgbaBuffer(buffer: &mut [u8], pixels: usize) {
+        let byte_count = pixels * 4;
+        buffer[..byte_count].iter_mut().for_each(|byte| *byte = 0);
+    }
+
+    fn GetRgbOutputBuffer(pixels: usize) -> Vec<u8> {
+        vec![0; pixels * 3] // zero-filled, three bytes per pixel
+    }
+
+    fn GetRgbaOutputBuffer(pixels: usize) -> Vec<u8> {
+        vec![0; pixels * 4] // zero-filled, four bytes per pixel
+    }
+
+    struct QcmsProfileTest {
+        in_profile: *mut Profile, // raw handles: null after `new`, released by `TearDown`
+        out_profile: *mut Profile,
+        transform: *mut qcms_transform,
+        // Pixel data: transform input, produced output and expected output.
+        input: Vec<u8>,
+        output: Vec<u8>,
+        reference: Vec<u8>,
+        // Pixel count and layout of the buffers; `precache` is set by `PrecacheOutput`.
+        pixels: usize,
+        storage_type: DataType,
+        precache: bool,
+    }
+
+    impl QcmsProfileTest {
+        fn new() -> QcmsProfileTest {
+            QcmsProfileTest {
+                in_profile: null_mut(),
+                out_profile: null_mut(),
+                transform: null_mut(),
+                input: Vec::new(),
+                output: Vec::new(),
+                reference: Vec::new(),
+                // Empty fixture: null handles, no pixels, RGB8 as the initial layout.
+                pixels: 0,
+                storage_type: RGB8,
+                precache: false,
+            }
+        }
+
+        fn SetUp(&mut self) {
+            qcms_enable_iccv4(); // the only set-up step; presumably enables ICC v4 profile support
+        }
+
+        /// Releases whichever of the profiles and the transform have been set.
+        unsafe fn TearDown(&mut self) {
+            if !self.in_profile.is_null() {
+                qcms_profile_release(self.in_profile)
+            }
+            if !self.out_profile.is_null() {
+                qcms_profile_release(self.out_profile)
+            }
+            // The transform is released last, after both profiles.
+            if !self.transform.is_null() {
+                qcms_transform_release(self.transform)
+            }
+        }
+
+        unsafe fn SetTransform(&mut self, transform: *mut qcms_transform) -> bool {
+            let previous = std::mem::replace(&mut self.transform, transform); // install the new one
+            if !previous.is_null() {
+                qcms_transform_release(previous)
+            }
+            !self.transform.is_null()
+        }
+
+        unsafe fn SetTransformForType(&mut self, ty: DataType) -> bool {
+            self.SetTransform(qcms_transform_create(
+                &*self.in_profile,
+                ty, // the same pixel layout is passed for the input and the output side
+                &*self.out_profile,
+                ty,
+                Perceptual,
+            ))
+        }
+
+        /// Generates the input pixels for layout `ty` and allocates zeroed
+        /// reference and output buffers for the same number of pixels.
+        ///
+        /// Records `ty` as the storage type and returns `true` when the input
+        /// holds at least one pixel.
+        unsafe fn SetBuffers(&mut self, ty: DataType) -> bool {
+            // Each layout gets the input generator of the same name, so the
+            // channel order of the generated data matches `ty`.
+            let (pixels, input) = match ty {
+                RGB8 => GetRgbInputBuffer(),
+                RGBA8 => GetRgbaInputBuffer(),
+                BGRA8 => GetBgraInputBuffer(),
+                _ => unreachable!("Unknown type!"),
+            };
+
+            // Only RGB8 has three bytes per pixel; the other two layouts have four.
+            let blank_buffer = || match ty {
+                RGB8 => GetRgbOutputBuffer(pixels),
+                _ => GetRgbaOutputBuffer(pixels),
+            };
+            self.reference = blank_buffer();
+            self.output = blank_buffer();
+
+            self.input = input;
+            self.pixels = pixels;
+            self.storage_type = ty;
+            self.pixels > 0
+        }
+
+        unsafe fn ClearOutputBuffer(&mut self) { // zeroes the first self.pixels pixels of self.output
+            match self.storage_type {
+                RGB8 => ClearRgbBuffer(&mut self.output, self.pixels),
+                RGBA8 | BGRA8 => ClearRgbaBuffer(&mut self.output, self.pixels),
+                _ => unreachable!("Unknown type!"),
+            }
+        }
+
+        unsafe fn ProduceRef(&mut self, trans_fn: transform_fn_t) {
+            trans_fn.unwrap()(
+                &*self.transform,
+                self.input.as_mut_ptr(),
+                self.reference.as_mut_ptr(), // the result is written to the reference buffer
+                self.pixels, // `unwrap` above panics when `trans_fn` is `None`
+            )
+        }
+
+        fn CopyInputToRef(&mut self) {
+            let pixelSize = match self.storage_type {
+                RGB8 => 3,
+                RGBA8 | BGRA8 => 4,
+                _ => unreachable!("Unknown type!"),
+            };
+            self.reference // `copy_from_slice` panics unless both slices have the same length
+                .copy_from_slice(&self.input[..self.pixels * pixelSize])
+        }
+
+        unsafe fn ProduceOutput(&mut self, trans_fn: transform_fn_t) {
+            self.ClearOutputBuffer(); // start from a zeroed output buffer
+            trans_fn.unwrap()(
+                &*self.transform,
+                self.input.as_mut_ptr(),
+                self.output.as_mut_ptr(), // the result is written to the output buffer
+                self.pixels,
+            )
+        }
+
+        unsafe fn VerifyOutput(&self, buf: &[u8]) -> bool {
+            match self.storage_type { // `buf` is the expected data, `self.output` the data under test
+                RGB8 => CmpRgbBuffer(buf, &self.output, self.pixels),
+                RGBA8 => CmpRgbaBuffer(buf, &self.output, self.pixels),
+                BGRA8 => CmpBgraBuffer(buf, &self.output, self.pixels),
+                _ => unreachable!("Unknown type!"),
+            }
+        }
+
+        unsafe fn ProduceVerifyOutput(&mut self, trans_fn: transform_fn_t) -> bool {
+            self.ProduceOutput(trans_fn); // run `trans_fn` into the output buffer
+            self.VerifyOutput(&self.reference) // then compare it with the reference buffer
+        }
+
+        unsafe fn PrecacheOutput(&mut self) {
+            qcms_profile_precache_output_transform(&mut *self.out_profile);
+            self.precache = true; // `TransformPrecache` asserts that this is still false on entry
+        }
+        unsafe fn TransformPrecache(&mut self) {
+            assert_eq!(self.precache, false);
+            assert!(self.SetBuffers(RGB8));
+            assert!(self.SetTransformForType(RGB8));
+            self.ProduceRef(Some(qcms_transform_data_rgb_out_lut));
+            // Precache the output profile, rebuild the transform and compare with the reference.
+            self.PrecacheOutput();
+            assert!(self.SetTransformForType(RGB8));
+            assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgb_out_lut_precache)))
+        }
+
+        unsafe fn TransformPrecachePlatformExt(&mut self) {
+            self.PrecacheOutput();
+            // Each section below produces a reference with the plain precache routine and
+            // checks the detected SIMD variants against it. First section: RGB.
+            assert!(self.SetBuffers(RGB8));
+            assert!(self.SetTransformForType(RGB8));
+            self.ProduceRef(Some(qcms_transform_data_rgb_out_lut_precache));
+
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            {
+                if is_x86_feature_detected!("sse2") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgb_out_lut_sse2)));
+                }
+                if is_x86_feature_detected!("avx") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgb_out_lut_avx)))
+                }
+            }
+
+            #[cfg(target_arch = "arm")]
+            {
+                if is_arm_feature_detected!("neon") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgb_out_lut_neon)))
+                }
+            }
+
+            #[cfg(target_arch = "aarch64")]
+            {
+                if is_aarch64_feature_detected!("neon") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgb_out_lut_neon)))
+                }
+            }
+
+            // Verify RGBA transform.
+            assert!(self.SetBuffers(RGBA8));
+            assert!(self.SetTransformForType(RGBA8));
+            self.ProduceRef(Some(qcms_transform_data_rgba_out_lut_precache));
+
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            {
+                if is_x86_feature_detected!("sse2") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgba_out_lut_sse2)));
+                }
+                if is_x86_feature_detected!("avx") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgba_out_lut_avx)))
+                }
+            }
+
+            #[cfg(target_arch = "arm")]
+            {
+                if is_arm_feature_detected!("neon") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgba_out_lut_neon)))
+                }
+            }
+
+            #[cfg(target_arch = "aarch64")]
+            {
+                if is_aarch64_feature_detected!("neon") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_rgba_out_lut_neon)))
+                }
+            }
+
+            // Verify BGRA transform.
+            assert!(self.SetBuffers(BGRA8));
+            assert!(self.SetTransformForType(BGRA8));
+            self.ProduceRef(Some(qcms_transform_data_bgra_out_lut_precache));
+
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            {
+                if is_x86_feature_detected!("sse2") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_bgra_out_lut_sse2)));
+                }
+                if is_x86_feature_detected!("avx") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_bgra_out_lut_avx)))
+                }
+            }
+
+            #[cfg(target_arch = "arm")]
+            {
+                if is_arm_feature_detected!("neon") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_bgra_out_lut_neon)))
+                }
+            }
+
+            #[cfg(target_arch = "aarch64")]
+            {
+                if is_aarch64_feature_detected!("neon") {
+                    assert!(self.ProduceVerifyOutput(Some(qcms_transform_data_bgra_out_lut_neon)))
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn sRGB_to_sRGB_precache() {
+        unsafe {
+            let mut pt = QcmsProfileTest::new();
+            pt.SetUp();
+            pt.in_profile = qcms_profile_sRGB();
+            pt.out_profile = qcms_profile_sRGB(); // sRGB on both sides
+            pt.TransformPrecache(); // precached output is compared with the non-precached one
+            pt.TearDown();
+        }
+    }
+
+    #[test]
+    fn sRGB_to_sRGB_transform_identity() {
+        unsafe {
+            let mut pt = QcmsProfileTest::new();
+            pt.SetUp();
+            pt.in_profile = qcms_profile_sRGB();
+            pt.out_profile = qcms_profile_sRGB();
+            pt.PrecacheOutput();
+            assert!(pt.SetBuffers(RGB8));
+            assert!(pt.SetTransformForType(RGB8)); // a null transform must not be dereferenced
+            qcms_transform_data(
+                &*pt.transform,
+                pt.input.as_mut_ptr() as *mut c_void,
+                pt.output.as_mut_ptr() as *mut c_void,
+                pt.pixels,
+            );
+            assert!(pt.VerifyOutput(&pt.input)); // sRGB -> sRGB has to reproduce the input
+            pt.TearDown();
+        }
+    }
+
+    /// Loads `profiles/<file>` below the crate root and parses it into a
+    /// profile, panicking if the file cannot be read or does not parse.
+    fn profile_from_path(file: &str) -> *mut Profile {
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("profiles")
+            .join(file);
+        let icc = std::fs::read(path).unwrap();
+
+        let parsed =
+            unsafe { qcms_profile_from_memory(icc.as_ptr() as *const c_void, icc.len()) };
+        assert_ne!(parsed, std::ptr::null_mut());
+        parsed
+    }
+
+    #[test]
+    fn sRGB_to_ThinkpadW540() {
+        unsafe {
+            let mut pt = QcmsProfileTest::new();
+            pt.SetUp();
+            pt.in_profile = qcms_profile_sRGB();
+            pt.out_profile = profile_from_path("lcms_thinkpad_w540.icc"); // read from profiles/
+            pt.TransformPrecachePlatformExt(); // SIMD variants vs. the plain precache routine
+            pt.TearDown();
+        }
+    }
+
+    #[test]
+    fn sRGB_to_SamsungSyncmaster() {
+        unsafe {
+            let mut pt = QcmsProfileTest::new();
+            pt.SetUp();
+            pt.in_profile = qcms_profile_sRGB();
+            pt.out_profile = profile_from_path("lcms_samsung_syncmaster.icc"); // read from profiles/
+            pt.TransformPrecachePlatformExt(); // SIMD variants vs. the plain precache routine
+            pt.TearDown();
+        }
+    }
+
+    #[test]
+    fn v4_output() {
+        qcms_enable_iccv4();
+        let input = qcms_profile_sRGB();
+        // B2A0-ident.icc was created from the profile in bug 1679621
+        // manually edited using iccToXML/iccFromXML
+        let output = profile_from_path("B2A0-ident.icc");
+        let transform = unsafe { qcms_transform_create(&*input, RGB8, &*output, RGB8, Perceptual) };
+        assert!(!transform.is_null()); // fail here instead of dereferencing a null transform
+        let src = [0u8, 60, 195];
+        let mut dst = [0u8, 0, 0];
+        unsafe {
+            qcms_transform_data(
+                &*transform,
+                src.as_ptr() as *const libc::c_void,
+                dst.as_mut_ptr() as *mut libc::c_void,
+                1,
+            );
+        }
+        assert_eq!(dst, [15, 16, 122]);
+        unsafe {
+            qcms_transform_release(transform);
+            qcms_profile_release(input);
+            qcms_profile_release(output);
+        }
+    }
+
+    #[test]
+    fn gray_smoke_test() {
+        let input = crate::Profile::new_gray_with_gamma(2.2);
+        let output = crate::Profile::new_sRGB();
+        let xfm =
+            transform_create(&input, GrayA8, &output, RGBA8, crate::Intent::default()).unwrap();
+        let src = [20u8, 20u8]; // presumably one gray+alpha pixel
+        let mut dst = [0u8, 0, 0, 0]; // never inspected: the test only has to run through
+        unsafe {
+            qcms_transform_data(
+                &xfm,
+                src.as_ptr() as *const libc::c_void,
+                dst.as_mut_ptr() as *mut libc::c_void,
+                src.len() / GrayA8.bytes_per_pixel(),
+            );
+        }
+    }
+
+    #[test]
+    fn data_create_rgb_with_gamma() {
+        // Creating profile data for gamma 2.2 with the primaries below and the
+        // sRGB white point must report a non-zero size.
+        let Rec709Primaries = qcms_CIE_xyYTRIPLE {
+            red: qcms_CIE_xyY {
+                x: 0.6400,
+                y: 0.3300,
+                Y: 1.0,
+            },
+            green: qcms_CIE_xyY {
+                x: 0.3000,
+                y: 0.6000,
+                Y: 1.0,
+            },
+            blue: qcms_CIE_xyY {
+                x: 0.1500,
+                y: 0.0600,
+                Y: 1.0,
+            },
+        };
+        let D65 = qcms_white_point_sRGB();
+
+        // Out-parameters handed to the call.
+        let mut mem = std::ptr::null_mut();
+        let mut size = 0;
+        unsafe {
+            qcms_data_create_rgb_with_gamma(D65, Rec709Primaries, 2.2, &mut mem, &mut size);
+        }
+        assert_ne!(size, 0);
+
+        // The returned block is passed to `libc::free`.
+        unsafe { libc::free(mem) };
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{Profile, Transform};
+    #[test]
+    fn identity() {
+        let p1 = Profile::new_sRGB();
+        let p2 = Profile::new_sRGB();
+        let xfm =
+            Transform::new(&p1, &p2, crate::DataType::RGB8, crate::Intent::default()).unwrap();
+        let mut data = [4, 30, 80];
+        xfm.apply(&mut data); // sRGB -> sRGB is expected to leave the pixel unchanged
+        assert_eq!(data, [4, 30, 80]);
+    }
+    #[test]
+    fn D50() {
+        let p1 = Profile::new_sRGB();
+        let p2 = Profile::new_XYZD50();
+        let xfm =
+            Transform::new(&p1, &p2, crate::DataType::RGB8, crate::Intent::default()).unwrap();
+        let mut data = [4, 30, 80];
+        xfm.apply(&mut data); // converts the pixel in place
+        assert_eq!(data, [4, 4, 15]); // pinned output for the sRGB -> XYZ D50 transform
+    }
+
+    /// Reads `profiles/<file>` below the crate root and parses it, panicking
+    /// when the file cannot be read or `Profile::new_from_slice` rejects it.
+    fn profile_from_path(file: &str) -> Box<Profile> {
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("profiles")
+            .join(file);
+        let icc = std::fs::read(path).unwrap();
+
+        Profile::new_from_slice(&icc, false).unwrap()
+    }
+
+    #[test]
+    fn parametric_threshold() {
+        let src = profile_from_path("parametric-thresh.icc"); // loaded from profiles/
+        let dst = crate::Profile::new_sRGB();
+        let xfm =
+            Transform::new(&src, &dst, crate::DataType::RGB8, crate::Intent::default()).unwrap();
+        let mut data = [4, 30, 80];
+        xfm.apply(&mut data); // converts the pixel in place
+        assert_eq!(data, [188, 188, 189]);
+    }
+
+    #[test]
+    fn cmyk() {
+        let input = profile_from_path("ps_cmyk_min.icc");
+        let output = Profile::new_sRGB();
+        let xfm = crate::Transform::new_to(
+            &input,
+            &output,
+            crate::DataType::CMYK, // presumably the layout of the source pixels
+            crate::DataType::RGB8, // presumably the layout of the destination pixels
+            crate::Intent::default(),
+        )
+        .unwrap();
+        let src = [4, 30, 80, 10]; // four input bytes
+        let mut dst = [0, 0, 0]; // three output bytes
+        xfm.convert(&src, &mut dst);
+        assert_eq!(dst, [252, 237, 211]);
+    }
+
+    #[test]
+    fn sRGB_parametric() {
+        let src = Profile::new_sRGB();
+        let dst = Profile::new_sRGB_parametric();
+        let xfm =
+            Transform::new(&src, &dst, crate::DataType::RGB8, crate::Intent::default()).unwrap();
+        let mut data = [4, 30, 80];
+        xfm.apply(&mut data); // the two sRGB variants are expected to agree on this pixel
+        assert_eq!(data, [4, 30, 80]);
+    }
+}
diff --git a/gfx/qcms/src/iccread.rs b/gfx/qcms/src/iccread.rs
new file mode 100644
index 0000000000..d86e9742d4
--- /dev/null
+++ b/gfx/qcms/src/iccread.rs
@@ -0,0 +1,1718 @@
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+use std::{
+    convert::{TryInto, TryFrom},
+    sync::atomic::AtomicBool,
+    sync::Arc,
+};
+
+use crate::{
+    double_to_s15Fixed16Number,
+    transform::{set_rgb_colorants, PrecacheOuput},
+};
+use crate::{matrix::Matrix, s15Fixed16Number, s15Fixed16Number_to_float, Intent, Intent::*};
+
+pub static SUPPORTS_ICCV4: AtomicBool = AtomicBool::new(cfg!(feature = "iccv4-enabled"));
+
+pub const RGB_SIGNATURE: u32 = 0x52474220; // 'RGB '
+pub const GRAY_SIGNATURE: u32 = 0x47524159; // 'GRAY'
+pub const XYZ_SIGNATURE: u32 = 0x58595A20; // 'XYZ '
+pub const LAB_SIGNATURE: u32 = 0x4C616220; // 'Lab '
+pub const CMYK_SIGNATURE: u32 = 0x434D594B; // 'CMYK'
+
+/// A color profile
+#[derive(Default, Debug)]
+pub struct Profile {
+    pub(crate) class_type: u32, // header bytes 12..16 (read_class_signature)
+    pub(crate) color_space: u32, // header bytes 16..20 (read_color_space)
+    pub(crate) pcs: u32, // header bytes 20..24 (read_pcs)
+    pub(crate) rendering_intent: Intent, // header bytes 64..68 (read_rendering_intent)
+    pub(crate) redColorant: XYZNumber,
+    pub(crate) blueColorant: XYZNumber,
+    pub(crate) greenColorant: XYZNumber,
+    // "TRC" is EOTF, e.g. gamma->linear transfer function.
+    // Because ICC profiles are phrased as decodings to the xyzd50-linear PCS.
+    pub(crate) redTRC: Option<Box<curveType>>,
+    pub(crate) blueTRC: Option<Box<curveType>>,
+    pub(crate) greenTRC: Option<Box<curveType>>,
+    pub(crate) grayTRC: Option<Box<curveType>>,
+    pub(crate) A2B0: Option<Box<lutType>>,
+    pub(crate) B2A0: Option<Box<lutType>>,
+    pub(crate) mAB: Option<Box<lutmABType>>,
+    pub(crate) mBA: Option<Box<lutmABType>>,
+    pub(crate) chromaticAdaption: Option<Matrix>,
+    pub(crate) output_table_r: Option<Arc<PrecacheOuput>>,
+    pub(crate) output_table_g: Option<Arc<PrecacheOuput>>,
+    pub(crate) output_table_b: Option<Arc<PrecacheOuput>>,
+    is_srgb: bool,
+}
+
+#[derive(Debug, Default)]
+#[allow(clippy::upper_case_acronyms)]
+pub(crate) struct lutmABType {
+    pub num_in_channels: u8,
+    pub num_out_channels: u8,
+    // 16 is the upperbound, actual is 0..num_in_channels.
+    pub num_grid_points: [u8; 16],
+    pub e00: s15Fixed16Number, // e00..e22 are the first nine words of the tag's matrix
+    pub e01: s15Fixed16Number,
+    pub e02: s15Fixed16Number,
+    pub e03: s15Fixed16Number, // e03, e13, e23 are the last three words of the tag's matrix
+    pub e10: s15Fixed16Number,
+    pub e11: s15Fixed16Number,
+    pub e12: s15Fixed16Number,
+    pub e13: s15Fixed16Number,
+    pub e20: s15Fixed16Number,
+    pub e21: s15Fixed16Number,
+    pub e22: s15Fixed16Number,
+    pub e23: s15Fixed16Number,
+    // reversed elements (for mBA)
+    pub reversed: bool,
+    pub clut_table: Option<Vec<f32>>, // None when the tag has no CLUT; entries scaled to 0.0..=1.0
+    pub a_curves: [Option<Box<curveType>>; MAX_CHANNELS], // read for num_in_channels channels
+    pub b_curves: [Option<Box<curveType>>; MAX_CHANNELS], // read for num_out_channels channels
+    pub m_curves: [Option<Box<curveType>>; MAX_CHANNELS], // read for num_out_channels channels
+}
+#[derive(Clone, Debug)]
+pub(crate) enum curveType {
+    Curve(Vec<uInt16Number>), // len=0 => Linear, len=1 => Gamma(v[0]), _ => lut
+    /// The ICC parametricCurveType is specified in terms of s15Fixed16Number,
+    /// so it's possible to use this variant to specify greater precision than
+    /// any raw ICC profile could
+    Parametric(Vec<f32>), // read_curveType stores 1, 3, 4, 5 or 7 parameters here
+}
+type uInt16Number = u16; // alias matching the ICC type name used throughout this file
+
+/* should lut8Type and lut16Type be different types? */
+#[derive(Debug)]
+pub(crate) struct lutType {
+    // used by lut8Type/lut16Type (mft2) only
+    pub num_input_channels: u8, // read_tag_lutType accepts 3 or 4
+    pub num_output_channels: u8, // read_tag_lutType accepts only 3
+    pub num_clut_grid_points: u8,
+    pub e00: s15Fixed16Number,
+    pub e01: s15Fixed16Number,
+    pub e02: s15Fixed16Number,
+    pub e10: s15Fixed16Number,
+    pub e11: s15Fixed16Number,
+    pub e12: s15Fixed16Number,
+    pub e20: s15Fixed16Number,
+    pub e21: s15Fixed16Number,
+    pub e22: s15Fixed16Number,
+    pub num_input_table_entries: u16, // fixed at 256 for lut8Type, 2..=4096 for lut16Type
+    pub num_output_table_entries: u16, // fixed at 256 for lut8Type, 2..=4096 for lut16Type
+    pub input_table: Vec<f32>, // num_input_table_entries * num_input_channels values in 0.0..=1.0
+    pub clut_table: Vec<f32>, // grid_points^in_channels * out_channels values in 0.0..=1.0
+    pub output_table: Vec<f32>, // num_output_table_entries * num_output_channels values
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug, Default)]
+#[allow(clippy::upper_case_acronyms)]
+pub struct XYZNumber {
+    pub X: s15Fixed16Number, // read_tag_XYZType: tag offset + 8
+    pub Y: s15Fixed16Number, // read_tag_XYZType: tag offset + 12
+    pub Z: s15Fixed16Number, // read_tag_XYZType: tag offset + 16
+}
+
+/// A color in the CIE xyY color space
+/* the names for the following two types are sort of ugly */
+#[repr(C)]
+#[derive(Copy, Clone)]
+#[allow(clippy::upper_case_acronyms)]
+pub struct qcms_CIE_xyY {
+    pub x: f64,
+    pub y: f64,
+    pub Y: f64, // fixed at 1.0 when built via From<qcms_chromaticity>
+}
+
+/// A more convenient type for specifying primaries and white points where
+/// luminosity is irrelevant (converts into qcms_CIE_xyY with Y = 1.0)
+struct qcms_chromaticity {
+    x: f64,
+    y: f64,
+}
+
+impl qcms_chromaticity {
+    const D65: Self = Self {
+        x: 0.3127,
+        y: 0.3290,
+    };
+}
+
+impl From<qcms_chromaticity> for qcms_CIE_xyY {
+    fn from(chroma: qcms_chromaticity) -> Self {
+        Self { x: chroma.x, y: chroma.y, Y: 1.0 } // luminance is pinned to 1.0
+    }
+}
+
+/// A set of CIE_xyY values that can be used to describe the primaries of a color space
+#[repr(C)]
+#[derive(Copy, Clone)]
+#[allow(clippy::upper_case_acronyms)]
+pub struct qcms_CIE_xyYTRIPLE {
+    pub red: qcms_CIE_xyY,
+    pub green: qcms_CIE_xyY,
+    pub blue: qcms_CIE_xyY,
+}
+
+struct Tag { // one 12-byte tag table entry, filled in by read_tag_table
+    signature: u32, // bytes 0..4 of the entry
+    offset: u32, // bytes 4..8; the readers use it as an offset into the profile buffer
+    size: u32, // bytes 8..12
+}
+
+/* It might be worth having a unified limit on content controlled
+ * allocation per profile. This would remove the need for many
+ * of the arbitrary limits that we used */
+
+type TagIndex = [Tag]; // unsized slice of tag table entries; always passed as &TagIndex
+
+/* a wrapper around the memory that we are going to parse
+ * into a qcms_profile */
+struct MemSource<'a> {
+    buf: &'a [u8], // the raw profile bytes being parsed
+    valid: bool, // set to false by invalid_source() on any parse error
+    invalid_reason: Option<&'static str>, // message of the most recent invalid_source() call
+}
+pub type uInt8Number = u8;
+#[inline] // maps 0..=255 linearly onto 0.0..=1.0
+fn uInt8Number_to_float(a: uInt8Number) -> f32 {
+    f32::from(a) / 255.0
+}
+
+#[inline] // maps 0..=65535 linearly onto 0.0..=1.0
+fn uInt16Number_to_float(a: uInt16Number) -> f32 {
+    f32::from(a) / 65535.0
+}
+
+fn invalid_source(mem: &mut MemSource, reason: &'static str) {
+    mem.valid = false; // parsers check this flag after reading (e.g. read_tag_lutmABType)
+    mem.invalid_reason = Some(reason); // a later call overwrites an earlier reason
+}
+// Big-endian u32 at `offset`; an out-of-range read flags the source invalid and yields 0.
+fn read_u32(mem: &mut MemSource, offset: usize) -> u32 {
+    match mem.buf.get(offset..offset + 4) {
+        Some(bytes) => u32::from_be_bytes(bytes.try_into().unwrap()),
+        None => {
+            invalid_source(mem, "Invalid offset");
+            0
+        }
+    }
+}
+// Big-endian u16 at `offset`; an out-of-range read flags the source invalid and yields 0.
+fn read_u16(mem: &mut MemSource, offset: usize) -> u16 {
+    match mem.buf.get(offset..offset + 2) {
+        Some(bytes) => u16::from_be_bytes(bytes.try_into().unwrap()),
+        None => {
+            invalid_source(mem, "Invalid offset");
+            0
+        }
+    }
+}
+fn read_u8(mem: &mut MemSource, offset: usize) -> u8 {
+    // Single byte at `offset`; an out-of-range read flags the source
+    // invalid and yields 0.
+    let found = mem.buf.get(offset).copied();
+    if found.is_none() {
+        invalid_source(mem, "Invalid offset");
+    }
+    found.unwrap_or(0)
+}
+fn read_s15Fixed16Number(mem: &mut MemSource, offset: usize) -> s15Fixed16Number {
+    read_u32(mem, offset) as s15Fixed16Number // big-endian word cast to the fixed-point alias
+}
+fn read_uInt8Number(mem: &mut MemSource, offset: usize) -> uInt8Number {
+    read_u8(mem, offset) // thin wrapper named after the ICC type
+}
+fn read_uInt16Number(mem: &mut MemSource, offset: usize) -> uInt16Number {
+    read_u16(mem, offset) // thin wrapper named after the ICC type
+}
+// Stores `value` big-endian at `offset`; panics with "OOB" if it does not fit.
+// get_mut() + expect() instead of indexing keeps it to one panic call site.
+pub fn write_u32(mem: &mut [u8], offset: usize, value: u32) {
+    let encoded = value.to_be_bytes();
+    let dest = mem.get_mut(offset..offset + encoded.len()).expect("OOB");
+    dest.copy_from_slice(&encoded);
+}
+// Stores `value` big-endian at `offset`; panics with "OOB" if it does not fit.
+// get_mut() + expect() instead of indexing keeps it to one panic call site.
+pub fn write_u16(mem: &mut [u8], offset: usize, value: u16) {
+    let encoded = value.to_be_bytes();
+    let dest = mem.get_mut(offset..offset + encoded.len()).expect("OOB");
+    dest.copy_from_slice(&encoded);
+}
+
+/* An arbitrary 4MB limit on profile size */
+pub(crate) const MAX_PROFILE_SIZE: usize = 1024 * 1024 * 4;
+const MAX_TAG_COUNT: u32 = 1024; // read_tag_table rejects tag tables with more entries
+
+fn check_CMM_type_signature(_src: &mut MemSource) { // currently a no-op: nothing is read
+    //uint32_t CMM_type_signature = read_u32(src, 4);
+    //TODO: do the check?
+}
+fn check_profile_version(src: &mut MemSource) {
+    /*
+    uint8_t major_revision = read_u8(src, 8 + 0);
+    uint8_t minor_revision = read_u8(src, 8 + 1);
+    */
+    let reserved1: u8 = read_u8(src, (8 + 2) as usize); // header byte 10
+    let reserved2: u8 = read_u8(src, (8 + 3) as usize); // header byte 11
+    /* Checking the version doesn't buy us anything
+    if (major_revision != 0x4) {
+        if (major_revision > 0x2)
+            invalid_source(src, "Unsupported major revision");
+        if (minor_revision > 0x40)
+            invalid_source(src, "Unsupported minor revision");
+    }
+    */
+    if reserved1 != 0 || reserved2 != 0 { // only the two reserved bytes are validated
+        invalid_source(src, "Invalid reserved bytes");
+    };
+}
+
+const INPUT_DEVICE_PROFILE: u32 = 0x73636e72; // 'scnr'
+pub const DISPLAY_DEVICE_PROFILE: u32 = 0x6d6e7472; // 'mntr'
+const OUTPUT_DEVICE_PROFILE: u32 = 0x70727472; // 'prtr'
+const DEVICE_LINK_PROFILE: u32 = 0x6c696e6b; // 'link' (not accepted by read_class_signature)
+const COLOR_SPACE_PROFILE: u32 = 0x73706163; // 'spac'
+const ABSTRACT_PROFILE: u32 = 0x61627374; // 'abst' (not accepted by read_class_signature)
+const NAMED_COLOR_PROFILE: u32 = 0x6e6d636c; // 'nmcl' (not accepted by read_class_signature)
+
+fn read_class_signature(profile: &mut Profile, mem: &mut MemSource) {
+    profile.class_type = read_u32(mem, 12); // stored even when rejected below
+    match profile.class_type {
+        DISPLAY_DEVICE_PROFILE
+        | INPUT_DEVICE_PROFILE
+        | OUTPUT_DEVICE_PROFILE
+        | COLOR_SPACE_PROFILE => {}
+        _ => { // every other class value flags the source invalid
+            invalid_source(mem, "Invalid  Profile/Device Class signature");
+        }
+    };
+}
+fn read_color_space(profile: &mut Profile, mem: &mut MemSource) {
+    profile.color_space = read_u32(mem, 16); // stored even when rejected below
+    match profile.color_space {
+        RGB_SIGNATURE | GRAY_SIGNATURE => {}
+        #[cfg(feature = "cmyk")]
+        CMYK_SIGNATURE => {} // CMYK is accepted only when the "cmyk" feature is enabled
+        _ => {
+            invalid_source(mem, "Unsupported colorspace");
+        }
+    };
+}
+fn read_pcs(profile: &mut Profile, mem: &mut MemSource) {
+    profile.pcs = read_u32(mem, 20); // stored even when rejected below
+    match profile.pcs {
+        XYZ_SIGNATURE | LAB_SIGNATURE => {} // the only two accepted connection spaces
+        _ => {
+            invalid_source(mem, "Unsupported pcs");
+        }
+    };
+}
+fn read_tag_table(_profile: &mut Profile, mem: &mut MemSource) -> Vec<Tag> {
+    let count = read_u32(mem, 128); // tag count sits right after the 128-byte header
+    if count > MAX_TAG_COUNT {
+        invalid_source(mem, "max number of tags exceeded");
+        return Vec::new();
+    }
+    let mut index = Vec::with_capacity(count as usize);
+    for i in 0..count {
+        let tag_start = (128 + 4 + 4 * i * 3) as usize; // entries are 12 bytes each
+        let offset = read_u32(mem, tag_start + 4);
+        if offset as usize > mem.buf.len() {
+            invalid_source(mem, "tag points beyond the end of the buffer");
+        } // the entry is still recorded; callers are expected to check mem.valid
+        index.push(Tag {
+            signature: read_u32(mem, tag_start),
+            offset,
+            size: read_u32(mem, tag_start + 8),
+        });
+    }
+
+    index
+}
+
+/// Checks a profile for obvious inconsistencies and returns true if it looks
+/// bogus and should probably be ignored. Only RGB profiles without
+/// A2B0/B2A0/mAB/mBA LUTs are inspected; every other profile returns false.
+#[no_mangle]
+pub extern "C" fn qcms_profile_is_bogus(profile: &mut Profile) -> bool {
+    let mut sum: [f32; 3] = [0.; 3];
+    let mut target: [f32; 3] = [0.; 3];
+    let mut tolerance: [f32; 3] = [0.; 3];
+    let rX: f32;
+    let rY: f32;
+    let rZ: f32;
+    let gX: f32;
+    let gY: f32;
+    let gZ: f32;
+    let bX: f32;
+    let bY: f32;
+    let bZ: f32;
+    let negative: bool;
+    let mut i: u32;
+    // We currently only check the bogosity of RGB profiles
+    if profile.color_space != RGB_SIGNATURE {
+        return false;
+    }
+    if profile.A2B0.is_some()
+        || profile.B2A0.is_some()
+        || profile.mAB.is_some()
+        || profile.mBA.is_some()
+    {
+        return false; // LUT-based profiles are not checked
+    }
+    rX = s15Fixed16Number_to_float(profile.redColorant.X);
+    rY = s15Fixed16Number_to_float(profile.redColorant.Y);
+    rZ = s15Fixed16Number_to_float(profile.redColorant.Z);
+    gX = s15Fixed16Number_to_float(profile.greenColorant.X);
+    gY = s15Fixed16Number_to_float(profile.greenColorant.Y);
+    gZ = s15Fixed16Number_to_float(profile.greenColorant.Z);
+    bX = s15Fixed16Number_to_float(profile.blueColorant.X);
+    bY = s15Fixed16Number_to_float(profile.blueColorant.Y);
+    bZ = s15Fixed16Number_to_float(profile.blueColorant.Z);
+    // Sum the values; they should add up to something close to white
+    sum[0] = rX + gX + bX;
+    sum[1] = rY + gY + bY;
+    sum[2] = rZ + gZ + bZ;
+    // Build our target vector (see mozilla bug 460629)
+    target[0] = 0.96420;
+    target[1] = 1.00000;
+    target[2] = 0.82491;
+    // Our tolerance vector - Recommended by Chris Murphy based on
+    // conversion from the LAB space criterion of no more than 3 in any one
+    // channel. This is similar to, but slightly more tolerant than Adobe's
+    // criterion.
+    tolerance[0] = 0.02;
+    tolerance[1] = 0.02;
+    tolerance[2] = 0.04;
+    // Compare with our tolerance
+    i = 0;
+    while i < 3 {
+        if !(sum[i as usize] - tolerance[i as usize] <= target[i as usize]
+            && sum[i as usize] + tolerance[i as usize] >= target[i as usize])
+        {
+            return true; // bogus unless sum[i] lies within target[i] +/- tolerance[i]
+        }
+        i += 1
+    }
+    if false { // negative-colorant check is deliberately disabled; see the else branch
+        negative = (rX < 0.)
+            || (rY < 0.)
+            || (rZ < 0.)
+            || (gX < 0.)
+            || (gY < 0.)
+            || (gZ < 0.)
+            || (bX < 0.)
+            || (bY < 0.)
+            || (bZ < 0.);
+    } else {
+        // Chromatic adaption to D50 can result in negative XYZ, but the white
+        // point D50 tolerance test has passed. Accept negative values herein.
+        // See https://bugzilla.mozilla.org/show_bug.cgi?id=498245#c18 onwards
+        // for discussion about whether profile XYZ can or cannot be negative,
+        // per the spec. Also the https://bugzil.la/450923 user report.
+        // Also: https://bugzil.la/1799391 and https://bugzil.la/1792469
+        negative = false; // bogus
+    }
+    if negative { // never taken while the check above stays disabled
+        return true;
+    }
+    // All Good
+    false
+}
+
+pub const TAG_bXYZ: u32 = 0x6258595a; // 'bXYZ'
+pub const TAG_gXYZ: u32 = 0x6758595a; // 'gXYZ'
+pub const TAG_rXYZ: u32 = 0x7258595a; // 'rXYZ'
+pub const TAG_rTRC: u32 = 0x72545243; // 'rTRC'
+pub const TAG_bTRC: u32 = 0x62545243; // 'bTRC'
+pub const TAG_gTRC: u32 = 0x67545243; // 'gTRC'
+pub const TAG_kTRC: u32 = 0x6b545243; // 'kTRC'
+pub const TAG_A2B0: u32 = 0x41324230; // 'A2B0'
+pub const TAG_B2A0: u32 = 0x42324130; // 'B2A0'
+pub const TAG_CHAD: u32 = 0x63686164; // 'chad'
+
+// Linear scan of the tag table: the first entry whose signature equals
+// `tag_id`, or None when the table has no such entry.
+fn find_tag(index: &TagIndex, tag_id: u32) -> Option<&Tag> {
+    index
+        .iter()
+        // `find` stops at the first match, like an early return from a loop.
+        .find(|entry| entry.signature == tag_id)
+}
+
+pub const XYZ_TYPE: u32 = 0x58595a20; // 'XYZ '
+pub const CURVE_TYPE: u32 = 0x63757276; // 'curv'
+pub const PARAMETRIC_CURVE_TYPE: u32 = 0x70617261; // 'para'
+pub const LUT16_TYPE: u32 = 0x6d667432; // 'mft2'
+pub const LUT8_TYPE: u32 = 0x6d667431; // 'mft1'
+pub const LUT_MAB_TYPE: u32 = 0x6d414220; // 'mAB '
+pub const LUT_MBA_TYPE: u32 = 0x6d424120; // 'mBA '
+pub const CHROMATIC_TYPE: u32 = 0x73663332; // 'sf32'
+
+fn read_tag_s15Fixed16ArrayType(src: &mut MemSource, tag: &Tag) -> Matrix {
+    let mut matrix: Matrix = Matrix { m: [[0.; 3]; 3] };
+    let offset: u32 = tag.offset;
+    let type_0: u32 = read_u32(src, offset as usize);
+    // Check mandatory type signature for s15Fixed16ArrayType
+    if type_0 != CHROMATIC_TYPE {
+        invalid_source(src, "unexpected type, expected \'sf32\'");
+    } // the nine entries are still read below even after a type mismatch
+    for i in 0..=8 {
+        matrix.m[(i / 3) as usize][(i % 3) as usize] = s15Fixed16Number_to_float(
+            read_s15Fixed16Number(src, (offset + 8 + (i * 4) as u32) as usize),
+        ); // entry i fills row i / 3, column i % 3; data starts 8 bytes into the tag
+    }
+    matrix
+}
+fn read_tag_XYZType(src: &mut MemSource, index: &TagIndex, tag_id: u32) -> XYZNumber {
+    let mut num = XYZNumber { X: 0, Y: 0, Z: 0 }; // returned zeroed when the tag is missing
+    let tag = find_tag(index, tag_id); // `index` is already a reference; no extra borrow
+    if let Some(tag) = tag {
+        let offset: u32 = tag.offset;
+        let type_0: u32 = read_u32(src, offset as usize);
+        if type_0 != XYZ_TYPE {
+            invalid_source(src, "unexpected type, expected XYZ");
+        } // the three components are still read after a type mismatch
+        num.X = read_s15Fixed16Number(src, (offset + 8) as usize);
+        num.Y = read_s15Fixed16Number(src, (offset + 12) as usize);
+        num.Z = read_s15Fixed16Number(src, (offset + 16) as usize)
+    } else {
+        invalid_source(src, "missing xyztag");
+    }
+    num
+}
+// Parses the 'curv' or 'para' curve found at `offset` (an offset into the profile
+// rather than a tag_index lookup, so curves nested inside mAB tags can be read too).
+// `*len` receives the encoded size in bytes; None is returned only after flagging src.
+fn read_curveType(src: &mut MemSource, offset: u32, len: &mut u32) -> Option<Box<curveType>> {
+    const COUNT_TO_LENGTH: [u32; 5] = [1, 3, 4, 5, 7]; // parameter count per 'para' function type
+    let type_0: u32 = read_u32(src, offset as usize);
+    let count: u32;
+    if type_0 != CURVE_TYPE && type_0 != PARAMETRIC_CURVE_TYPE {
+        invalid_source(src, "unexpected type, expected CURV or PARA");
+        return None;
+    }
+    if type_0 == CURVE_TYPE {
+        count = read_u32(src, (offset + 8) as usize);
+        //arbitrary
+        if count > 40000 {
+            invalid_source(src, "curve size too large");
+            return None;
+        }
+        let mut table = Vec::with_capacity(count as usize);
+        for i in 0..count {
+            table.push(read_u16(src, (offset + 12 + i * 2) as usize));
+        }
+        *len = 12 + count * 2;
+        Some(Box::new(curveType::Curve(table)))
+    } else {
+        count = read_u16(src, (offset + 8) as usize) as u32; // here `count` is the function type
+        if count > 4 {
+            invalid_source(src, "parametric function type not supported.");
+            return None;
+        }
+        let mut params = Vec::with_capacity(COUNT_TO_LENGTH[count as usize] as usize);
+        for i in 0..COUNT_TO_LENGTH[count as usize] {
+            params.push(s15Fixed16Number_to_float(read_s15Fixed16Number(
+                src,
+                (offset + 12 + i * 4) as usize,
+            )));
+        }
+        *len = 12 + COUNT_TO_LENGTH[count as usize] * 4;
+        if count == 1 || count == 2 {
+            /* we have a type 1 or type 2 function that has a division by 'a' */
+            let a: f32 = params[1];
+            if a == 0.0 {
+                invalid_source(src, "parametricCurve definition causes division by zero");
+            } // the curve is still returned; callers are expected to check src.valid
+        }
+        Some(Box::new(curveType::Parametric(params)))
+    }
+}
+fn read_tag_curveType(
+    src: &mut MemSource,
+    index: &TagIndex,
+    tag_id: u32,
+) -> Option<Box<curveType>> {
+    match find_tag(index, tag_id) {
+        // The encoded length only matters for nested curves, so it is discarded here.
+        Some(entry) => read_curveType(src, entry.offset, &mut 0),
+        None => {
+            invalid_source(src, "missing curvetag");
+            None
+        }
+    }
+}
+
+const MAX_LUT_SIZE: u32 = 500000; // arbitrary
+const MAX_CHANNELS: usize = 10; // arbitrary
+fn read_nested_curveType( // reads `num_channels` consecutive curves starting at `curve_offset`
+    src: &mut MemSource,
+    curveArray: &mut [Option<Box<curveType>>; MAX_CHANNELS],
+    num_channels: u8,
+    curve_offset: u32,
+) {
+    let mut channel_offset: u32 = 0; // running offset of the next curve within the block
+    #[allow(clippy::needless_range_loop)]
+    for i in 0..usize::from(num_channels) {
+        let mut tag_len: u32 = 0;
+        curveArray[i] = read_curveType(src, curve_offset + channel_offset, &mut tag_len);
+        if curveArray[i].is_none() {
+            invalid_source(src, "invalid nested curveType curve");
+            break; // remaining channels are left untouched
+        } else {
+            channel_offset += tag_len;
+            // 4 byte aligned
+            if tag_len % 4 != 0 {
+                channel_offset += 4 - tag_len % 4
+            }
+        }
+    }
+}
+
+/* Parses an 'mAB ' / 'mBA ' LUT tag; None on any failure. See section 10.10 for specs */
+fn read_tag_lutmABType(src: &mut MemSource, tag: &Tag) -> Option<Box<lutmABType>> {
+    let offset: u32 = tag.offset;
+    let mut clut_size: u32 = 1;
+    let type_0: u32 = read_u32(src, offset as usize);
+    if type_0 != LUT_MAB_TYPE && type_0 != LUT_MBA_TYPE {
+        return None; // rejected without flagging the source invalid
+    }
+    let num_in_channels = read_u8(src, (offset + 8) as usize);
+    let num_out_channels = read_u8(src, (offset + 9) as usize);
+    if num_in_channels > 10 || num_out_channels > 10 {
+        return None;
+    }
+    // We require 3in/out channels since we only support RGB->XYZ (or RGB->LAB)
+    // XXX: If we remove this restriction make sure that the number of channels
+    //      is less or equal to the maximum number of mAB curves in qcmsint.h
+    //      also check for clut_size overflow. Also make sure it's != 0
+    if num_in_channels != 3 || num_out_channels != 3 {
+        return None;
+    }
+    // some of this data is optional and is denoted by a zero offset
+    // we also use this to track their existence
+    let mut a_curve_offset = read_u32(src, (offset + 28) as usize);
+    let mut clut_offset = read_u32(src, (offset + 24) as usize);
+    let mut m_curve_offset = read_u32(src, (offset + 20) as usize);
+    let mut matrix_offset = read_u32(src, (offset + 16) as usize);
+    let mut b_curve_offset = read_u32(src, (offset + 12) as usize);
+    // Convert offsets relative to the tag to relative to the profile
+    // preserve zero for optional fields
+    // NOTE(review): these are unchecked u32 additions on file-supplied values
+    if a_curve_offset != 0 {
+        a_curve_offset += offset
+    }
+    if clut_offset != 0 {
+        clut_offset += offset
+    }
+    if m_curve_offset != 0 {
+        m_curve_offset += offset
+    }
+    if matrix_offset != 0 {
+        matrix_offset += offset
+    }
+    if b_curve_offset != 0 {
+        b_curve_offset += offset
+    }
+    if clut_offset != 0 {
+        debug_assert!(num_in_channels == 3);
+        // clut_size can not overflow since lg(256^num_in_channels) = 24 bits.
+        for i in 0..u32::from(num_in_channels) {
+            clut_size *= read_u8(src, (clut_offset + i) as usize) as u32;
+            if clut_size == 0 {
+                invalid_source(src, "bad clut_size");
+            }
+        }
+    } else {
+        clut_size = 0
+    }
+    // 24bits * 3 won't overflow either
+    clut_size *= num_out_channels as u32;
+    if clut_size > MAX_LUT_SIZE {
+        return None;
+    }
+
+    let mut lut = Box::new(lutmABType::default());
+
+    if clut_offset != 0 {
+        for i in 0..usize::from(num_in_channels) {
+            lut.num_grid_points[i] = read_u8(src, clut_offset as usize + i);
+            if lut.num_grid_points[i] == 0 {
+                invalid_source(src, "bad grid_points");
+            }
+        }
+    }
+    // Reverse the processing of transformation elements for mBA type.
+    lut.reversed = type_0 == LUT_MBA_TYPE;
+    lut.num_in_channels = num_in_channels;
+    lut.num_out_channels = num_out_channels;
+    #[allow(clippy::identity_op, clippy::erasing_op)]
+    if matrix_offset != 0 {
+        // read the matrix if we have it
+        lut.e00 = read_s15Fixed16Number(src, (matrix_offset + (4 * 0) as u32) as usize); // the caller checks that this doesn't happen
+        lut.e01 = read_s15Fixed16Number(src, (matrix_offset + (4 * 1) as u32) as usize);
+        lut.e02 = read_s15Fixed16Number(src, (matrix_offset + (4 * 2) as u32) as usize);
+        lut.e10 = read_s15Fixed16Number(src, (matrix_offset + (4 * 3) as u32) as usize);
+        lut.e11 = read_s15Fixed16Number(src, (matrix_offset + (4 * 4) as u32) as usize);
+        lut.e12 = read_s15Fixed16Number(src, (matrix_offset + (4 * 5) as u32) as usize);
+        lut.e20 = read_s15Fixed16Number(src, (matrix_offset + (4 * 6) as u32) as usize);
+        lut.e21 = read_s15Fixed16Number(src, (matrix_offset + (4 * 7) as u32) as usize);
+        lut.e22 = read_s15Fixed16Number(src, (matrix_offset + (4 * 8) as u32) as usize);
+        lut.e03 = read_s15Fixed16Number(src, (matrix_offset + (4 * 9) as u32) as usize);
+        lut.e13 = read_s15Fixed16Number(src, (matrix_offset + (4 * 10) as u32) as usize);
+        lut.e23 = read_s15Fixed16Number(src, (matrix_offset + (4 * 11) as u32) as usize)
+    }
+    if a_curve_offset != 0 {
+        read_nested_curveType(src, &mut lut.a_curves, num_in_channels, a_curve_offset);
+    }
+    if m_curve_offset != 0 {
+        read_nested_curveType(src, &mut lut.m_curves, num_out_channels, m_curve_offset);
+    }
+    if b_curve_offset != 0 {
+        read_nested_curveType(src, &mut lut.b_curves, num_out_channels, b_curve_offset);
+    } else {
+        invalid_source(src, "B curves required");
+    }
+    if clut_offset != 0 {
+        let clut_precision = read_u8(src, (clut_offset + 16) as usize); // bytes per CLUT entry
+        let mut clut_table = Vec::with_capacity(clut_size as usize);
+        if clut_precision == 1 {
+            for i in 0..clut_size {
+                clut_table.push(uInt8Number_to_float(read_uInt8Number(
+                    src,
+                    (clut_offset + 20 + i) as usize,
+                )));
+            }
+            lut.clut_table = Some(clut_table);
+        } else if clut_precision == 2 {
+            for i in 0..clut_size {
+                clut_table.push(uInt16Number_to_float(read_uInt16Number(
+                    src,
+                    (clut_offset + 20 + i * 2) as usize,
+                )));
+            }
+            lut.clut_table = Some(clut_table);
+        } else {
+            invalid_source(src, "Invalid clut precision");
+        }
+    }
+    if !src.valid { // any read or validation failure flagged on src discards the LUT
+        return None;
+    }
+    Some(lut)
+}
+fn read_tag_lutType(src: &mut MemSource, tag: &Tag) -> Option<Box<lutType>> {
+    let offset: u32 = tag.offset;
+    let type_0: u32 = read_u32(src, offset as usize);
+    let num_input_table_entries: u16;
+    let num_output_table_entries: u16;
+    let input_offset: u32; // where the input tables start, relative to the tag
+    let entry_size: usize; // bytes per table entry: 1 for lut8Type, 2 for lut16Type
+    if type_0 == LUT8_TYPE {
+        num_input_table_entries = 256u16;
+        num_output_table_entries = 256u16;
+        entry_size = 1;
+        input_offset = 48
+    } else if type_0 == LUT16_TYPE {
+        num_input_table_entries = read_u16(src, (offset + 48) as usize);
+        num_output_table_entries = read_u16(src, (offset + 50) as usize);
+
+        // these limits come from the spec
+        if !(2..=4096).contains(&num_input_table_entries)
+            || !(2..=4096).contains(&num_output_table_entries)
+        {
+            invalid_source(src, "Bad channel count");
+            return None;
+        }
+        entry_size = 2;
+        input_offset = 52
+    } else {
+        debug_assert!(false);
+        invalid_source(src, "Unexpected lut type");
+        return None;
+    }
+    let in_chan = read_u8(src, (offset + 8) as usize);
+    let out_chan = read_u8(src, (offset + 9) as usize);
+    if !(in_chan == 3 || in_chan == 4) || out_chan != 3 {
+        invalid_source(src, "CLUT only supports RGB and CMYK");
+        return None;
+    }
+
+    let grid_points = read_u8(src, (offset + 10) as usize);
+    let clut_size = match (grid_points as u32).checked_pow(in_chan as u32) {
+        Some(clut_size) => clut_size, // number of grid nodes: grid_points^in_chan
+        _ => {
+            invalid_source(src, "CLUT size overflow");
+            return None;
+        }
+    };
+    match clut_size {
+        1..=MAX_LUT_SIZE => {} // OK
+        0 => {
+            invalid_source(src, "CLUT must not be empty.");
+            return None;
+        }
+        _ => {
+            invalid_source(src, "CLUT too large");
+            return None;
+        }
+    }
+
+    let e00 = read_s15Fixed16Number(src, (offset + 12) as usize);
+    let e01 = read_s15Fixed16Number(src, (offset + 16) as usize);
+    let e02 = read_s15Fixed16Number(src, (offset + 20) as usize);
+    let e10 = read_s15Fixed16Number(src, (offset + 24) as usize);
+    let e11 = read_s15Fixed16Number(src, (offset + 28) as usize);
+    let e12 = read_s15Fixed16Number(src, (offset + 32) as usize);
+    let e20 = read_s15Fixed16Number(src, (offset + 36) as usize);
+    let e21 = read_s15Fixed16Number(src, (offset + 40) as usize);
+    let e22 = read_s15Fixed16Number(src, (offset + 44) as usize);
+
+    let mut input_table = Vec::with_capacity((num_input_table_entries * in_chan as u16) as usize);
+    for i in 0..(num_input_table_entries * in_chan as u16) {
+        if type_0 == LUT8_TYPE {
+            input_table.push(uInt8Number_to_float(read_uInt8Number(
+                src,
+                (offset + input_offset) as usize + i as usize * entry_size,
+            )))
+        } else {
+            input_table.push(uInt16Number_to_float(read_uInt16Number(
+                src,
+                (offset + input_offset) as usize + i as usize * entry_size,
+            )))
+        }
+    }
+    let clut_offset = ((offset + input_offset) as usize
+        + (num_input_table_entries as i32 * in_chan as i32) as usize * entry_size)
+        as u32; // the CLUT follows the input tables directly
+
+    let mut clut_table = Vec::with_capacity((clut_size * out_chan as u32) as usize);
+    for i in 0..clut_size * out_chan as u32 {
+        if type_0 == LUT8_TYPE {
+            clut_table.push(uInt8Number_to_float(read_uInt8Number(
+                src,
+                clut_offset as usize + i as usize * entry_size,
+            )));
+        } else if type_0 == LUT16_TYPE {
+            clut_table.push(uInt16Number_to_float(read_uInt16Number(
+                src,
+                clut_offset as usize + i as usize * entry_size,
+            )));
+        }
+    }
+
+    let output_offset = // the output tables follow the CLUT directly
+        (clut_offset as usize + (clut_size * out_chan as u32) as usize * entry_size) as u32;
+
+    let mut output_table =
+        Vec::with_capacity((num_output_table_entries * out_chan as u16) as usize);
+    for i in 0..num_output_table_entries as i32 * out_chan as i32 {
+        if type_0 == LUT8_TYPE {
+            output_table.push(uInt8Number_to_float(read_uInt8Number(
+                src,
+                output_offset as usize + i as usize * entry_size,
+            )))
+        } else {
+            output_table.push(uInt16Number_to_float(read_uInt16Number(
+                src,
+                output_offset as usize + i as usize * entry_size,
+            )))
+        }
+    }
+    Some(Box::new(lutType { // out-of-range table reads only flag src; this function does not check src.valid
+        num_input_table_entries,
+        num_output_table_entries,
+        num_input_channels: in_chan,
+        num_output_channels: out_chan,
+        num_clut_grid_points: grid_points,
+        e00,
+        e01,
+        e02,
+        e10,
+        e11,
+        e12,
+        e20,
+        e21,
+        e22,
+        input_table,
+        clut_table,
+        output_table,
+    }))
+}
+fn read_rendering_intent(profile: &mut Profile, src: &mut MemSource) {
+    let intent = read_u32(src, 64); // header bytes 64..68
+    profile.rendering_intent = match intent {
+        x if x == Perceptual as u32 => Perceptual,
+        x if x == RelativeColorimetric as u32 => RelativeColorimetric,
+        x if x == Saturation as u32 => Saturation,
+        x if x == AbsoluteColorimetric as u32 => AbsoluteColorimetric,
+        _ => { // unknown value: flag the source and fall back to the default intent
+            invalid_source(src, "unknown rendering intent");
+            Intent::default()
+        }
+    };
+}
+fn profile_create() -> Box<Profile> {
+    Box::new(Profile::default()) // every field starts at its Default value
+}
+/* build sRGB gamma table: num_entries u16 samples of the curve below, via build_trc_table */
+/* based on cmsBuildParametricGamma() */
+#[allow(clippy::many_single_char_names)]
+fn build_sRGB_gamma_table(num_entries: i32) -> Vec<u16> {
+    /* taken from lcms: Build_sRGBGamma() */
+    let gamma: f64 = 2.4;
+    let a: f64 = 1.0 / 1.055;
+    let b: f64 = 0.055 / 1.055;
+    let c: f64 = 1.0 / 12.92;
+    let d: f64 = 0.04045;
+
+    build_trc_table(
+        num_entries,
+        // IEC 61966-2.1 (sRGB)
+        // Y = (aX + b)^Gamma | X >= d
+        // Y = cX             | X < d
+        |x| {
+            if x >= d {
+                let e: f64 = a * x + b;
+                if e > 0. { // keeps powf away from a non-positive base
+                    e.powf(gamma)
+                } else {
+                    0.
+                }
+            } else {
+                c * x
+            }
+        },
+    )
+}
+
+/// eotf: electro-optical transfer characteristic function, maps from [0, 1]
+/// in non-linear (voltage) space to [0, 1] in linear (optical) space. Should
+/// generally be a concave up function.
+fn build_trc_table(num_entries: i32, eotf: impl Fn(f64) -> f64) -> Vec<u16> {
+    // Largest value a table entry can take.
+    const FULL_SCALE: f64 = 65535.0;
+
+    let mut table = Vec::with_capacity(num_entries as usize);
+
+    // Converts the sample at `index` into a rounded, saturated entry.
+    let quantize = |index: i32| -> u16 {
+        // Sample positions are spread evenly over [0, 1], end points included.
+        let position: f64 = index as f64 / (num_entries - 1) as f64;
+        // Adding 0.5 before the floor below rounds to the nearest integer.
+        let scaled: f64 = eotf(position) * FULL_SCALE + 0.5;
+        // Saturate -- this could likely move to a separate function
+        scaled.clamp(0.0, FULL_SCALE).floor() as u16
+    };
+
+    table.extend((0..num_entries).map(quantize));
+    table
+}
+fn curve_from_table(table: &[u16]) -> Box<curveType> {
+    Box::new(curveType::Curve(Vec::from(table))) // the curve owns a copy of the entries
+}
+pub fn float_to_u8Fixed8Number(a: f32) -> u16 {
+    // Largest encodable value: 0xff whole units plus 0xff/256.
+    let largest = 255.0 + 255.0 / 256f32;
+    match a {
+        v if v > largest => 0xffffu16,
+        v if v < 0.0 => 0u16,
+        v => (v * 256.0 + 0.5).floor() as u16, // scale by 2^8, round to nearest
+    }
+}
+
+fn curve_from_gamma(gamma: f32) -> Box<curveType> {
+    curveType::Curve(vec![float_to_u8Fixed8Number(gamma)]).into() // one-entry table
+}
+
+fn identity_curve() -> Box<curveType> {
+    Box::new(curveType::Curve(vec![])) // a curve whose table has no entries
+}
+
+/* from lcms: cmsWhitePointFromTemp */
+/* tempK must be >= 4000. and <= 25000.
+ * Invalid values of tempK will return
+ * (x,y,Y) = (-1.0, -1.0, -1.0)
+ * similar to argyll: icx_DTEMP2XYZ() */
+fn white_point_from_temp(temp_K: i32) -> qcms_CIE_xyY {
+    // Marker value handed back when tempK is out of range.
+    let invalid = qcms_CIE_xyY {
+        x: -1.0,
+        y: -1.0,
+        Y: -1.0,
+    };
+    // No optimization provided.
+    let T = temp_K as f64;
+    let T2 = T * T; // Square
+    let T3 = T2 * T; // Cube
+    let x = if (4000.0..=7000.0).contains(&T) {
+        // For correlated color temperature (T) between 4000K and 7000K:
+        -4.6070 * (1E9 / T3) + 2.9678 * (1E6 / T2) + 0.09911 * (1E3 / T) + 0.244063
+    } else if T > 7000.0 && T <= 25000.0 {
+        // or for correlated color temperature (T) between 7000K and 25000K:
+        -2.0064 * (1E9 / T3) + 1.9018 * (1E6 / T2) + 0.24748 * (1E3 / T) + 0.237040
+    } else {
+        // Invalid tempK
+        debug_assert!(false, "invalid temp");
+        return invalid;
+    };
+
+    // Obtain y(x)
+    let y = -3.000 * (x * x) + 2.870 * x - 0.275;
+    // wave factors (not used, but here for future extensions)
+    // let M1 = (-1.3515 - 1.7703*x + 5.9114 *y)/(0.0241 + 0.2562*x - 0.7341*y);
+    // let M2 = (0.0300 - 31.4424*x + 30.0717*y)/(0.0241 + 0.2562*x - 0.7341*y);
+    // Assemble the white point; Y is fixed at 1.0.
+    qcms_CIE_xyY {
+        x,
+        y,
+        Y: 1.0,
+    }
+}
+#[no_mangle]
+pub extern "C" fn qcms_white_point_sRGB() -> qcms_CIE_xyY {
+    white_point_from_temp(6504) // 6504 is the colour temperature (tempK) used for sRGB
+}
+
+/// See [Rec. ITU-T H.273 (12/2016)](https://www.itu.int/rec/T-REC-H.273-201612-I/en) Table 2
+/// Values 0, 3, 13–21, 23–255 are all reserved so all map to the same variant
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum ColourPrimaries {
+    /// For future use by ITU-T | ISO/IEC (no explicit discriminant, so `as u8` yields 0)
+    Reserved,
+    /// Rec. ITU-R BT.709-6<br />
+    /// Rec. ITU-R BT.1361-0 conventional colour gamut system and extended colour gamut system (historical)<br />
+    /// IEC 61966-2-1 sRGB or sYCC IEC 61966-2-4<br />
+    /// Society of Motion Picture and Television Engineers (SMPTE) RP 177 (1993) Annex B<br />
+    Bt709 = 1,
+    /// Unspecified<br />
+    /// Image characteristics are unknown or are determined by the application.
+    Unspecified = 2,
+    /// Rec. ITU-R BT.470-6 System M (historical)<br />
+    /// United States National Television System Committee 1953 Recommendation for transmission standards for color television<br />
+    /// United States Federal Communications Commission (2003) Title 47 Code of Federal Regulations 73.682 (a) (20)<br />
+    Bt470M = 4,
+    /// Rec. ITU-R BT.470-6 System B, G (historical) Rec. ITU-R BT.601-7 625<br />
+    /// Rec. ITU-R BT.1358-0 625 (historical)<br />
+    /// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM<br />
+    Bt470Bg = 5,
+    /// Rec. ITU-R BT.601-7 525<br />
+    /// Rec. ITU-R BT.1358-1 525 or 625 (historical) Rec. ITU-R BT.1700-0 NTSC<br />
+    /// SMPTE 170M (2004)<br />
+    /// (functionally the same as the value 7)<br />
+    Bt601 = 6,
+    /// SMPTE 240M (1999) (historical) (functionally the same as the value 6)<br />
+    Smpte240 = 7,
+    /// Generic film (colour filters using Illuminant C)<br />
+    Generic_film = 8,
+    /// Rec. ITU-R BT.2020-2<br />
+    /// Rec. ITU-R BT.2100-0<br />
+    Bt2020 = 9,
+    /// SMPTE ST 428-1<br />
+    /// (CIE 1931 XYZ as in ISO 11664-1)<br />
+    Xyz = 10,
+    /// SMPTE RP 431-2 (2011)<br />
+    Smpte431 = 11,
+    /// SMPTE EG 432-1 (2010)<br />
+    Smpte432 = 12,
+    /// EBU Tech. 3213-E (1975)<br />
+    Ebu3213 = 22,
+}
+
+impl From<u8> for ColourPrimaries {
+    fn from(value: u8) -> Self {
+        match value {
+            1 => Self::Bt709,
+            2 => Self::Unspecified,
+            4 => Self::Bt470M,
+            5 => Self::Bt470Bg,
+            6 => Self::Bt601,
+            7 => Self::Smpte240,
+            8 => Self::Generic_film,
+            9 => Self::Bt2020,
+            10 => Self::Xyz,
+            11 => Self::Smpte431,
+            12 => Self::Smpte432,
+            22 => Self::Ebu3213,
+            0 | 3 | 13..=21 | 23..=255 => Self::Reserved, // every remaining code point
+        }
+    }
+}
+
+#[test]
+fn colour_primaries() {
+    for value in 0..=u8::MAX {
+        let variant = ColourPrimaries::from(value);
+        if variant != ColourPrimaries::Reserved {
+            assert_eq!(value, variant as u8); // non-reserved codes must round-trip
+        }
+    }
+}
+
+impl From<ColourPrimaries> for qcms_CIE_xyYTRIPLE {
+    fn from(value: ColourPrimaries) -> Self {
+        // Every arm yields the chromaticities of the green, blue and red
+        // primaries, in that order.
+        let (green, blue, red) = match value {
+            // These two values do not identify a set of primaries.
+            ColourPrimaries::Reserved => panic!("CP={} is reserved", value as u8),
+            ColourPrimaries::Unspecified => panic!("CP={} is unspecified", value as u8),
+
+            ColourPrimaries::Bt709 => (
+                qcms_chromaticity { x: 0.300, y: 0.600 },
+                qcms_chromaticity { x: 0.150, y: 0.060 },
+                qcms_chromaticity { x: 0.640, y: 0.330 },
+            ),
+            ColourPrimaries::Bt470M => (
+                qcms_chromaticity { x: 0.21, y: 0.71 },
+                qcms_chromaticity { x: 0.14, y: 0.08 },
+                qcms_chromaticity { x: 0.67, y: 0.33 },
+            ),
+            ColourPrimaries::Bt470Bg => (
+                qcms_chromaticity { x: 0.29, y: 0.60 },
+                qcms_chromaticity { x: 0.15, y: 0.06 },
+                qcms_chromaticity { x: 0.64, y: 0.33 },
+            ),
+            ColourPrimaries::Bt601 | ColourPrimaries::Smpte240 => (
+                qcms_chromaticity { x: 0.310, y: 0.595 },
+                qcms_chromaticity { x: 0.155, y: 0.070 },
+                qcms_chromaticity { x: 0.630, y: 0.340 },
+            ),
+            ColourPrimaries::Generic_film => (
+                qcms_chromaticity { x: 0.243, y: 0.692 },
+                qcms_chromaticity { x: 0.145, y: 0.049 },
+                qcms_chromaticity { x: 0.681, y: 0.319 },
+            ),
+            ColourPrimaries::Bt2020 => (
+                qcms_chromaticity { x: 0.170, y: 0.797 },
+                qcms_chromaticity { x: 0.131, y: 0.046 },
+                qcms_chromaticity { x: 0.708, y: 0.292 },
+            ),
+            ColourPrimaries::Xyz => (
+                qcms_chromaticity { x: 0.0, y: 1.0 },
+                qcms_chromaticity { x: 0.0, y: 0.0 },
+                qcms_chromaticity { x: 1.0, y: 0.0 },
+            ),
+            // These two share primaries, but have distinct white points
+            ColourPrimaries::Smpte431 | ColourPrimaries::Smpte432 => (
+                qcms_chromaticity { x: 0.265, y: 0.690 },
+                qcms_chromaticity { x: 0.150, y: 0.060 },
+                qcms_chromaticity { x: 0.680, y: 0.320 },
+            ),
+            ColourPrimaries::Ebu3213 => (
+                qcms_chromaticity { x: 0.295, y: 0.605 },
+                qcms_chromaticity { x: 0.155, y: 0.077 },
+                qcms_chromaticity { x: 0.630, y: 0.340 },
+            ),
+        };
+
+        Self {
+            red: red.into(),
+            green: green.into(),
+            blue: blue.into(),
+        }
+    }
+}
+
+impl ColourPrimaries {
+    fn white_point(self) -> qcms_CIE_xyY {
+        let chromaticity = match self {
+            Self::Reserved => panic!("CP={} is reserved", self as u8),
+            Self::Unspecified => panic!("CP={} is unspecified", self as u8),
+            Self::Bt709
+            | Self::Bt470Bg
+            | Self::Bt601
+            | Self::Smpte240
+            | Self::Bt2020
+            | Self::Smpte432
+            | Self::Ebu3213 => qcms_chromaticity::D65,
+            // Both of these use the same white chromaticity.
+            Self::Bt470M | Self::Generic_film => qcms_chromaticity { x: 0.310, y: 0.316 },
+            Self::Xyz => qcms_chromaticity {
+                x: 1. / 3.,
+                y: 1. / 3.,
+            },
+            Self::Smpte431 => qcms_chromaticity { x: 0.314, y: 0.351 },
+        };
+        chromaticity.into()
+    }
+}
+
+/// See [Rec. ITU-T H.273 (12/2016)](https://www.itu.int/rec/T-REC-H.273-201612-I/en) Table 3
+/// Values 0, 3, 19–255 are all reserved so all map to the same variant
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum TransferCharacteristics {
+    /// For future use by ITU-T | ISO/IEC (no explicit discriminant, so `as u8` yields 0)
+    Reserved,
+    /// Rec. ITU-R BT.709-6<br />
+    /// Rec. ITU-R BT.1361-0 conventional colour gamut system (historical)<br />
+    /// (functionally the same as the values 6, 14 and 15)<br />
+    Bt709 = 1,
+    /// Image characteristics are unknown or are determined by the application.<br />
+    Unspecified = 2,
+    /// Rec. ITU-R BT.470-6 System M (historical)<br />
+    /// United States National Television System Committee 1953 Recommendation for transmission standards for color television<br />
+    /// United States Federal Communications Commission (2003) Title 47 Code of Federal Regulations 73.682 (a) (20)<br />
+    /// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM<br />
+    Bt470M = 4,
+    /// Rec. ITU-R BT.470-6 System B, G (historical)<br />
+    Bt470Bg = 5,
+    /// Rec. ITU-R BT.601-7 525 or 625<br />
+    /// Rec. ITU-R BT.1358-1 525 or 625 (historical)<br />
+    /// Rec. ITU-R BT.1700-0 NTSC SMPTE 170M (2004)<br />
+    /// (functionally the same as the values 1, 14 and 15)<br />
+    Bt601 = 6,
+    /// SMPTE 240M (1999) (historical)<br />
+    Smpte240 = 7,
+    /// Linear transfer characteristics<br />
+    Linear = 8,
+    /// Logarithmic transfer characteristic (100:1 range)<br />
+    Log_100 = 9,
+    /// Logarithmic transfer characteristic (100 * Sqrt( 10 ) : 1 range)<br />
+    Log_100_sqrt10 = 10,
+    /// IEC 61966-2-4<br />
+    Iec61966 = 11,
+    /// Rec. ITU-R BT.1361-0 extended colour gamut system (historical)<br />
+    Bt_1361 = 12,
+    /// IEC 61966-2-1 sRGB or sYCC<br />
+    Srgb = 13,
+    /// Rec. ITU-R BT.2020-2 (10-bit system)<br />
+    /// (functionally the same as the values 1, 6 and 15)<br />
+    Bt2020_10bit = 14,
+    /// Rec. ITU-R BT.2020-2 (12-bit system)<br />
+    /// (functionally the same as the values 1, 6 and 14)<br />
+    Bt2020_12bit = 15,
+    /// SMPTE ST 2084 for 10-, 12-, 14- and 16-bit systems<br />
+    /// Rec. ITU-R BT.2100-0 perceptual quantization (PQ) system<br />
+    Smpte2084 = 16,
+    /// SMPTE ST 428-1<br />
+    Smpte428 = 17,
+    /// ARIB STD-B67<br />
+    /// Rec. ITU-R BT.2100-0 hybrid log-gamma (HLG) system<br />
+    Hlg = 18,
+}
+
+#[test]
+fn transfer_characteristics() {
+    for value in 0..=u8::MAX {
+        let variant = TransferCharacteristics::from(value);
+        if variant != TransferCharacteristics::Reserved {
+            assert_eq!(value, variant as u8); // non-reserved codes must round-trip
+        }
+    }
+}
+
+impl From<u8> for TransferCharacteristics {
+    fn from(value: u8) -> Self {
+        match value {
+            1 => Self::Bt709,
+            2 => Self::Unspecified,
+            4 => Self::Bt470M,
+            5 => Self::Bt470Bg,
+            6 => Self::Bt601,
+            7 => Self::Smpte240, // unimplemented
+            8 => Self::Linear,
+            9 => Self::Log_100,
+            10 => Self::Log_100_sqrt10,
+            11 => Self::Iec61966, // unimplemented
+            12 => Self::Bt_1361,  // unimplemented
+            13 => Self::Srgb,
+            14 => Self::Bt2020_10bit,
+            15 => Self::Bt2020_12bit,
+            16 => Self::Smpte2084,
+            17 => Self::Smpte428, // unimplemented
+            18 => Self::Hlg,
+            0 | 3 | 19..=255 => Self::Reserved, // every remaining code point
+        }
+    }
+}
+
+impl TryFrom<TransferCharacteristics> for curveType {
+    type Error = ();
+    /// See [ICC.1:2010](https://www.color.org/specification/ICC1v43_2010-12.pdf)
+    /// See [Rec. ITU-R BT.2100-2](https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2100-2-201807-I!!PDF-E.pdf)
+    fn try_from(value: TransferCharacteristics) -> Result<Self, Self::Error> {
+        const NUM_TRC_TABLE_ENTRIES: i32 = 1024;
+
+        Ok(match value {
+            TransferCharacteristics::Reserved => panic!("TC={} is reserved", value as u8),
+            TransferCharacteristics::Bt709
+            | TransferCharacteristics::Bt601
+            | TransferCharacteristics::Bt2020_10bit
+            | TransferCharacteristics::Bt2020_12bit => {
+                // The opto-electronic transfer characteristic function (OETF)
+                // as defined in ITU-T H.273 table 3, row 1:
+                //
+                // V = (α * Lc^0.45) − (α − 1)  for 1 >= Lc >= β
+                // V = 4.500 * Lc               for β >  Lc >= 0
+                //
+                // Inverting gives the electro-optical transfer characteristic
+                // function (EOTF) which can be represented as ICC
+                // parametricCurveType with 4 parameters (ICC.1:2010 Table 5).
+                // Converting between the two (Lc ↔︎ Y, V ↔︎ X):
+                //
+                // Y = (a * X + b)^g  for (X >= d)
+                // Y = c * X          for (X < d)
+                //
+                // g, a, b, c, d can then be defined in terms of α and β:
+                //
+                // g = 1 / 0.45
+                // a = 1 / α
+                // b = 1 - 1 / α  (that is, 1 - a)
+                // c = 1 / 4.500
+                // d = 4.500 * β
+                //
+                // α and β are determined by solving the piecewise equations to
+                // ensure continuity of both value and slope at the value β.
+                // We use the values specified for 10-bit systems in
+                // https://www.itu.int/rec/R-REC-BT.2020-2-201510-I Table 4
+                // since this results in values similar to those of available ICC
+                // profiles after converting to s15Fixed16Number, providing us
+                // good test coverage.
+
+                type Float = f32;
+
+                const alpha: Float = 1.099;
+                const beta: Float = 0.018;
+
+                const linear_coef: Float = 4.500;
+                const pow_exp: Float = 0.45;
+
+                const g: Float = 1. / pow_exp;
+                const a: Float = 1. / alpha;
+                const b: Float = 1. - a;
+                const c: Float = 1. / linear_coef;
+                const d: Float = linear_coef * beta;
+
+                curveType::Parametric(vec![g, a, b, c, d])
+            }
+            TransferCharacteristics::Unspecified => panic!("TC={} is unspecified", value as u8),
+            TransferCharacteristics::Bt470M => *curve_from_gamma(2.2),
+            TransferCharacteristics::Bt470Bg => *curve_from_gamma(2.8),
+            TransferCharacteristics::Smpte240 => return Err(()), // no curve is built for this value
+            TransferCharacteristics::Linear => *curve_from_gamma(1.),
+            TransferCharacteristics::Log_100 => {
+                // See log_100_transfer_characteristics() for derivation
+                // The opto-electronic transfer characteristic function (OETF)
+                // as defined in ITU-T H.273 table 3, row 9:
+                //
+                // V = 1.0 + Log10(Lc) ÷ 2  for 1    >= Lc >= 0.01
+                // V = 0.0                  for 0.01 >  Lc >= 0
+                //
+                // Inverting this to give the EOTF required for the profile gives
+                //
+                // Lc = 10^(2*V - 2)  for 1 >= V >= 0
+                let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, |v| 10f64.powf(2. * v - 2.));
+                curveType::Curve(table)
+            }
+            TransferCharacteristics::Log_100_sqrt10 => {
+                // The opto-electronic transfer characteristic function (OETF)
+                // as defined in ITU-T H.273 table 3, row 10:
+                //
+                // V = 1.0 + Log10(Lc) ÷ 2.5  for               1 >= Lc >= Sqrt(10) ÷ 1000
+                // V = 0.0                    for Sqrt(10) ÷ 1000 >  Lc >= 0
+                //
+                // Inverting this to give the EOTF required for the profile gives
+                //
+                // Lc = 10^(2.5*V - 2.5)  for 1 >= V >= 0
+                let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, |v| 10f64.powf(2.5 * v - 2.5));
+                curveType::Curve(table)
+            }
+            TransferCharacteristics::Iec61966 => return Err(()), // no curve is built for this value
+            TransferCharacteristics::Bt_1361 => return Err(()), // no curve is built for this value
+            TransferCharacteristics::Srgb => {
+                // Should we prefer this or curveType::Parametric?
+                curveType::Curve(build_sRGB_gamma_table(NUM_TRC_TABLE_ENTRIES))
+            }
+
+            TransferCharacteristics::Smpte2084 => {
+                // Despite using Lo rather than Lc, H.273 gives the OETF:
+                //
+                // V = ( ( c1 + c2 * (Lo)^n ) ÷ ( 1 + c3 * (Lo)^n ) )^m
+                const c1: f64 = 0.8359375;
+                const c2: f64 = 18.8515625;
+                const c3: f64 = 18.6875;
+                const m: f64 = 78.84375;
+                const n: f64 = 0.1593017578125;
+
+                // Inverting this to give the EOTF required for the profile
+                // (and confirmed by Rec. ITU-R BT.2100-2, Table 4) gives
+                //
+                // Y = ( max[( X^(1/m) - c1 ), 0] ÷ ( c2 - c3 * X^(1/m) ) )^(1/n)
+                let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, |x| {
+                    ((x.powf(1. / m) - c1).max(0.) / (c2 - c3 * x.powf(1. / m))).powf(1. / n)
+                });
+                curveType::Curve(table)
+            }
+            TransferCharacteristics::Smpte428 => return Err(()), // no curve is built for this value
+            TransferCharacteristics::Hlg => {
+                // The opto-electronic transfer characteristic function (OETF)
+                // as defined in ITU-T H.273 table 3, row 18:
+                //
+                // V = a * Ln(12 * Lc - b) + c  for 1      >= Lc >  1 ÷ 12
+                // V = Sqrt(3) * Lc^0.5         for 1 ÷ 12 >= Lc >= 0
+                const a: f64 = 0.17883277;
+                const b: f64 = 0.28466892;
+                const c: f64 = 0.55991073;
+
+                // Inverting this to give the EOTF required for the profile
+                // (and confirmed by Rec. ITU-R BT.2100-2, Table 4) gives
+                //
+                // Y = (X^2) / 3             for 0   <= X <= 0.5
+                // Y = ((e^((X-c)/a))+b)/12  for 0.5 <  X <= 1
+                let table = build_trc_table(NUM_TRC_TABLE_ENTRIES, |x| {
+                    if x <= 0.5 {
+                        let y1 = x.powf(2.) / 3.;
+                        assert!((0. ..=1. / 12.).contains(&y1));
+                        y1
+                    } else {
+                        (std::f64::consts::E.powf((x - c) / a) + b) / 12.
+                    }
+                });
+                curveType::Curve(table)
+            }
+        })
+    }
+}
+
+#[cfg(test)]
+fn check_transfer_characteristics(cicp: TransferCharacteristics, icc_path: &str) {
+    let cicp_tc = curveType::try_from(cicp).unwrap();
+    let icc = Profile::new_from_path(icc_path).unwrap();
+    let icc_tc = icc.redTRC.as_ref().unwrap();
+
+    eprintln!("cicp_tc: {:?}", cicp_tc);
+    eprintln!("icc_tc: {:?}", icc_tc);
+
+    let mut icc_out = [0u8; crate::transform::PRECACHE_OUTPUT_SIZE];
+    let mut cicp_out = [0u8; crate::transform::PRECACHE_OUTPUT_SIZE];
+    crate::transform_util::compute_precache(icc_tc, &mut icc_out);
+    crate::transform_util::compute_precache(&cicp_tc, &mut cicp_out);
+
+    // Identical entries and entries one step apart are tolerated; the latter
+    // are only counted. Anything further apart fails the test.
+    let mut off_by_one = 0;
+    for (i, (&from_cicp, &from_icc)) in cicp_out.iter().zip(icc_out.iter()).enumerate() {
+        match (from_cicp as i16) - (from_icc as i16) {
+            0 => {}
+            1 | -1 => off_by_one += 1,
+            _ => assert_eq!(from_cicp, from_icc, "difference at index {}", i),
+        }
+    }
+    eprintln!("{} / {} off by one", off_by_one, cicp_out.len());
+}
+
+#[test]
+fn srgb_transfer_characteristics() {
+    // Compares the generated sRGB curve with the red TRC stored in the given ICC file.
+    check_transfer_characteristics(TransferCharacteristics::Srgb, "sRGB_lcms.icc");
+}
+
+#[test]
+fn bt709_transfer_characteristics() {
+    // Compares the generated BT.709 curve with the red TRC stored in the given ICC file.
+    check_transfer_characteristics(TransferCharacteristics::Bt709, "ITU-709.icc");
+}
+
+#[test]
+fn bt2020_10bit_transfer_characteristics() {
+    // Compares the generated curve with the red TRC stored in the given ICC file.
+    check_transfer_characteristics(TransferCharacteristics::Bt2020_10bit, "ITU-2020.icc");
+}
+
+#[test]
+fn bt2020_12bit_transfer_characteristics() {
+    // Checked against the same ICC file as the 10-bit variant.
+    check_transfer_characteristics(TransferCharacteristics::Bt2020_12bit, "ITU-2020.icc");
+}
+
+impl Profile {
+    //XXX: it would be nice if we had a way of ensuring
+    // everything in a profile was initialized regardless of how it was created
+    //XXX: should this also be taking a black_point?
+    /* similar to CGColorSpaceCreateCalibratedRGB */
+    pub fn new_rgb_with_table(
+        white_point: qcms_CIE_xyY,
+        primaries: qcms_CIE_xyYTRIPLE,
+        table: &[u16],
+    ) -> Option<Box<Profile>> {
+        let mut profile = profile_create();
+        //XXX: should store the whitepoint
+        if !set_rgb_colorants(&mut profile, white_point, primaries) {
+            return None;
+        }
+        profile.class_type = DISPLAY_DEVICE_PROFILE;
+        profile.rendering_intent = Perceptual;
+        profile.color_space = RGB_SIGNATURE;
+        profile.pcs = XYZ_TYPE;
+        profile.redTRC = Some(curve_from_table(table));
+        profile.greenTRC = profile.redTRC.clone(); // each channel owns its own copy
+        profile.blueTRC = profile.redTRC.clone();
+        Some(profile)
+    }
+    pub fn new_sRGB() -> Box<Profile> {
+        // Ingredients: BT.709 primaries, the sRGB white point and a
+        // 1024-entry sRGB tone curve.
+        let primaries: qcms_CIE_xyYTRIPLE = ColourPrimaries::Bt709.into();
+        let white = qcms_white_point_sRGB();
+        let gamma_table = build_sRGB_gamma_table(1024);
+
+        let built = Profile::new_rgb_with_table(white, primaries, &gamma_table);
+        let mut srgb = built.unwrap();
+        // Flag the result so that `is_sRGB` reports true.
+        srgb.is_srgb = true;
+        srgb
+    }
+
+    /// Returns true if this profile is flagged as sRGB (the value of its `is_srgb` field)
+    pub fn is_sRGB(&self) -> bool {
+        self.is_srgb
+    }
+
+    pub(crate) fn new_sRGB_parametric() -> Box<Profile> {
+        let mut profile = profile_create();
+        let white_point = qcms_white_point_sRGB();
+        let primaries: qcms_CIE_xyYTRIPLE = ColourPrimaries::Bt709.into();
+        // The success flag returned here is not inspected.
+        set_rgb_colorants(&mut profile, white_point, primaries);
+
+        // g, a, b, c, d of the piecewise curve
+        //   Y = (a * X + b)^g  for X >= d
+        //   Y = c * X          for X < d
+        let parameters = vec![2.4, 1. / 1.055, 0.055 / 1.055, 1. / 12.92, 0.04045];
+        let curve = Box::new(curveType::Parametric(parameters));
+        profile.redTRC = Some(curve.clone());
+        profile.blueTRC = Some(curve.clone());
+        profile.greenTRC = Some(curve);
+
+        profile.is_srgb = true;
+        profile.class_type = DISPLAY_DEVICE_PROFILE;
+        profile.rendering_intent = Perceptual;
+        profile.color_space = RGB_SIGNATURE;
+        profile.pcs = XYZ_TYPE;
+        profile
+    }
+
+    /// Create a new profile with D50 adopted white and identity transform functions
+    pub fn new_XYZD50() -> Box<Profile> {
+        let (zero, one) = (double_to_s15Fixed16Number(0.), double_to_s15Fixed16Number(1.));
+        let mut profile = profile_create();
+        profile.redColorant.X = one; // red   = (1, 0, 0)
+        profile.redColorant.Y = zero;
+        profile.redColorant.Z = zero;
+        profile.greenColorant.X = zero; // green = (0, 1, 0)
+        profile.greenColorant.Y = one;
+        profile.greenColorant.Z = zero;
+        profile.blueColorant.X = zero; // blue  = (0, 0, 1)
+        profile.blueColorant.Y = zero;
+        profile.blueColorant.Z = one;
+        profile.redTRC = Some(identity_curve());
+        profile.greenTRC = Some(identity_curve());
+        profile.blueTRC = Some(identity_curve());
+        profile.class_type = DISPLAY_DEVICE_PROFILE;
+        profile.rendering_intent = Perceptual;
+        profile.color_space = RGB_SIGNATURE;
+        profile.pcs = XYZ_TYPE;
+        profile
+    }
+
+    pub fn new_cicp(cp: ColourPrimaries, tc: TransferCharacteristics) -> Option<Box<Profile>> {
+        let mut profile = profile_create();
+        //XXX: should store the whitepoint
+        // NOTE: reserved/unspecified `cp` or `tc` values panic in the conversions used below.
+        if !set_rgb_colorants(&mut profile, cp.white_point(), qcms_CIE_xyYTRIPLE::from(cp)) {
+            return None;
+        }
+        let curve = Box::new(curveType::try_from(tc).ok()?);
+        profile.redTRC = Some(curve.clone());
+        profile.blueTRC = Some(curve.clone());
+        profile.greenTRC = Some(curve);
+        profile.is_srgb = (cp, tc) == (ColourPrimaries::Bt709, TransferCharacteristics::Srgb);
+        profile.class_type = DISPLAY_DEVICE_PROFILE;
+        profile.rendering_intent = Perceptual;
+        profile.color_space = RGB_SIGNATURE;
+        profile.pcs = XYZ_TYPE;
+        Some(profile)
+    }
+
+    pub fn new_gray_with_gamma(gamma: f32) -> Box<Profile> {
+        let gray_curve = curve_from_gamma(gamma);
+        let mut profile = profile_create();
+        profile.class_type = DISPLAY_DEVICE_PROFILE;
+        profile.rendering_intent = Perceptual;
+        profile.color_space = GRAY_SIGNATURE;
+        profile.pcs = XYZ_TYPE;
+        profile.grayTRC = Some(gray_curve); // the only curve set by this constructor
+        profile
+    }
+
+    pub fn new_rgb_with_gamma_set(
+        white_point: qcms_CIE_xyY,
+        primaries: qcms_CIE_xyYTRIPLE,
+        redGamma: f32,
+        greenGamma: f32,
+        blueGamma: f32,
+    ) -> Option<Box<Profile>> {
+        let mut profile = profile_create();
+        let gamma_curve = |gamma: f32| Some(curve_from_gamma(gamma));
+        //XXX: should store the whitepoint
+        if !set_rgb_colorants(&mut profile, white_point, primaries) {
+            return None;
+        }
+        profile.class_type = DISPLAY_DEVICE_PROFILE;
+        profile.rendering_intent = Perceptual;
+        profile.color_space = RGB_SIGNATURE;
+        profile.pcs = XYZ_TYPE;
+        profile.redTRC = gamma_curve(redGamma);
+        profile.greenTRC = gamma_curve(greenGamma);
+        profile.blueTRC = gamma_curve(blueGamma);
+        Some(profile)
+    }
+
+    pub fn new_from_path(file: &str) -> Option<Box<Profile>> {
+        std::fs::read(file).ok().and_then(|bytes| Profile::new_from_slice(&bytes, false))
+    }
+
+    pub fn new_from_slice(mem: &[u8], curves_only: bool) -> Option<Box<Profile>> {
+        // A profile starts with its own length as a u32, so 4 bytes are the bare minimum.
+        if mem.len() < 4 {
+            return None;
+        }
+        let mut source: MemSource = MemSource {
+            buf: mem,
+            valid: true,
+            invalid_reason: None,
+        };
+        let src: &mut MemSource = &mut source;
+        let length = read_u32(src, 0) as usize;
+        if length > mem.len() {
+            return None;
+        }
+        // shrink the area that we can read if appropriate
+        src.buf = &src.buf[0..length];
+
+        /* ensure that the profile size is sane so it's easier to reason about */
+        if src.buf.len() <= 64 || src.buf.len() >= MAX_PROFILE_SIZE {
+            return None;
+        }
+        let mut profile = profile_create();
+
+        check_CMM_type_signature(src);
+        check_profile_version(src);
+        read_class_signature(&mut profile, src);
+        read_rendering_intent(&mut profile, src);
+        read_color_space(&mut profile, src);
+        read_pcs(&mut profile, src);
+        //TODO read rest of profile stuff
+        if !src.valid {
+            return None;
+        }
+
+        let index = read_tag_table(&mut profile, src);
+        if !src.valid || index.is_empty() {
+            return None;
+        }
+
+        if let Some(chad) = find_tag(&index, TAG_CHAD) {
+            profile.chromaticAdaption = Some(read_tag_s15Fixed16ArrayType(src, chad))
+        } else {
+            profile.chromaticAdaption = None; //Signal the data is not present
+        }
+
+        if profile.class_type == DISPLAY_DEVICE_PROFILE
+            || profile.class_type == INPUT_DEVICE_PROFILE
+            || profile.class_type == OUTPUT_DEVICE_PROFILE
+            || profile.class_type == COLOR_SPACE_PROFILE
+        {
+            if profile.color_space == RGB_SIGNATURE {
+                if !curves_only {
+                    if let Some(A2B0) = find_tag(&index, TAG_A2B0) {
+                        let lut_type = read_u32(src, A2B0.offset as usize);
+                        if lut_type == LUT8_TYPE || lut_type == LUT16_TYPE {
+                            profile.A2B0 = read_tag_lutType(src, A2B0)
+                        } else if lut_type == LUT_MAB_TYPE {
+                            profile.mAB = read_tag_lutmABType(src, A2B0)
+                        }
+                    }
+                    if let Some(B2A0) = find_tag(&index, TAG_B2A0) {
+                        let lut_type = read_u32(src, B2A0.offset as usize);
+                        if lut_type == LUT8_TYPE || lut_type == LUT16_TYPE {
+                            profile.B2A0 = read_tag_lutType(src, B2A0)
+                        } else if lut_type == LUT_MBA_TYPE {
+                            profile.mBA = read_tag_lutmABType(src, B2A0)
+                        }
+                    }
+                }
+                if find_tag(&index, TAG_rXYZ).is_some() || curves_only {
+                    profile.redColorant = read_tag_XYZType(src, &index, TAG_rXYZ);
+                    profile.greenColorant = read_tag_XYZType(src, &index, TAG_gXYZ);
+                    profile.blueColorant = read_tag_XYZType(src, &index, TAG_bXYZ)
+                }
+                if !src.valid {
+                    return None;
+                }
+
+                if find_tag(&index, TAG_rTRC).is_some() || curves_only {
+                    profile.redTRC = read_tag_curveType(src, &index, TAG_rTRC);
+                    profile.greenTRC = read_tag_curveType(src, &index, TAG_gTRC);
+                    profile.blueTRC = read_tag_curveType(src, &index, TAG_bTRC);
+                    if profile.redTRC.is_none()
+                        || profile.blueTRC.is_none()
+                        || profile.greenTRC.is_none()
+                    {
+                        return None;
+                    }
+                }
+            } else if profile.color_space == GRAY_SIGNATURE {
+                profile.grayTRC = read_tag_curveType(src, &index, TAG_kTRC);
+                profile.grayTRC.as_ref()?;
+            } else if profile.color_space == CMYK_SIGNATURE {
+                // Same pairing as the RGB branch: an A2B0 tag that is not a lut8/lut16 table
+                // must be of LUT_MAB_TYPE; LUT_MBA_TYPE is what B2A0 tags carry.
+                if let Some(A2B0) = find_tag(&index, TAG_A2B0) {
+                    let lut_type = read_u32(src, A2B0.offset as usize);
+                    if lut_type == LUT8_TYPE || lut_type == LUT16_TYPE {
+                        profile.A2B0 = read_tag_lutType(src, A2B0)
+                    } else if lut_type == LUT_MAB_TYPE {
+                        profile.mAB = read_tag_lutmABType(src, A2B0)
+                    }
+                }
+            } else {
+                debug_assert!(false, "read_color_space protects against entering here");
+                return None;
+            }
+        } else {
+            return None;
+        }
+
+        if !src.valid {
+            return None;
+        }
+        Some(profile)
+    }
+    /// Precomputes the information needed for this profile to be
+    /// used as the output profile when constructing a `Transform`.
+    pub fn precache_output_transform(&mut self) {
+        crate::transform::qcms_profile_precache_output_transform(self); // all work is delegated
+    }
+}
diff --git a/gfx/qcms/src/lib.rs b/gfx/qcms/src/lib.rs
new file mode 100644
index 0000000000..c311964ee3
--- /dev/null
+++ b/gfx/qcms/src/lib.rs
@@ -0,0 +1,72 @@
+/*! A pure Rust color management library.
+*/
+
+#![allow(dead_code)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(non_upper_case_globals)]
+// These are needed for the neon SIMD code and can be removed once the MSRV supports the
+// intrinsics we use
+#![cfg_attr(feature = "neon", feature(stdsimd))]
+#![cfg_attr(
+    feature = "neon",
+    feature(arm_target_feature, raw_ref_op)
+
+)]
+
+/// Rendering intents; the discriminants match the Rendering Intent values from the ICC spec.
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub enum Intent {
+    Perceptual = 0,
+    RelativeColorimetric = 1,
+    Saturation = 2,
+    AbsoluteColorimetric = 3,
+}
+
+use Intent::*;
+
+impl Default for Intent {
+    /// Chris Murphy (CM consultant) suggests perceptual as a default in the event that we
+    /// cannot reproduce relative + Black Point Compensation. BPC brings an unacceptable
+    /// performance overhead, so perceptual is what we return.
+    fn default() -> Self {
+        Self::Perceptual
+    }
+}
+
+pub(crate) type s15Fixed16Number = i32; // signed fixed point with 16 fractional bits
+
+/* Produces the nearest float to `a / 65536` with a maximum error
+ * of 1/1024, which happens for large values like 0x40000040. */
+#[inline]
+fn s15Fixed16Number_to_float(a: s15Fixed16Number) -> f32 {
+    (a as f32) * (1.0 / 65536.0)
+}
+
+#[inline]
+fn double_to_s15Fixed16Number(v: f64) -> s15Fixed16Number {
+    (v * 65536f64) as i32 // scale by 2^16; the `as` cast truncates toward zero and saturates
+}
+
+#[cfg(feature = "c_bindings")]
+extern crate libc;
+#[cfg(feature = "c_bindings")]
+pub mod c_bindings;
+mod chain;
+mod gtest;
+mod iccread;
+mod matrix;
+mod transform;
+pub use iccread::qcms_CIE_xyY as CIE_xyY;
+pub use iccread::qcms_CIE_xyYTRIPLE as CIE_xyYTRIPLE;
+pub use iccread::Profile;
+pub use transform::DataType;
+pub use transform::Transform;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod transform_avx;
+#[cfg(all(any(target_arch = "aarch64", target_arch = "arm"), feature = "neon"))]
+mod transform_neon;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod transform_sse2;
+mod transform_util;
diff --git a/gfx/qcms/src/matrix.rs b/gfx/qcms/src/matrix.rs
new file mode 100644
index 0000000000..8cd450241e
--- /dev/null
+++ b/gfx/qcms/src/matrix.rs
@@ -0,0 +1,134 @@
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#[derive(Copy, Clone, Debug, Default)]
+pub struct Matrix {
+    pub m: [[f32; 3]; 3], // Indexed as `m[row][col]`: three rows of three elements.
+}
+
+#[derive(Copy, Clone)]
+pub struct Vector {
+    pub v: [f32; 3], // Treated as a column vector by `Matrix::eval`.
+}
+
+impl Matrix {
+    /// Returns the product `self * v`, treating `v` as a column vector.
+    ///
+    /// `self.m` is indexed as `m[row][col]`; all arithmetic is done in `f32`.
+    pub fn eval(&self, v: Vector) -> Vector {
+        // One dot product per row, accumulated left to right.
+        let dot = |row: &[f32; 3]| row[0] * v.v[0] + row[1] * v.v[1] + row[2] * v.v[2];
+        Vector {
+            v: [dot(&self.m[0]), dot(&self.m[1]), dot(&self.m[2])],
+        }
+    }
+
+    /// Returns a copy of row `r`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `r` is not 0, 1 or 2.
+    pub fn row(&self, r: usize) -> [f32; 3] {
+        self.m[r]
+    }
+
+    /// Returns the determinant of the matrix, computed in `f32`.
+    ///
+    /// Note: this computation could probably be reused in `invert`.
+    pub fn det(&self) -> f32 {
+        let [[m00, m01, m02], [m10, m11, m12], [m20, m21, m22]] = self.m;
+        // Three positive triple products followed by three negative ones.
+        m00 * m11 * m22
+            + m01 * m12 * m20
+            + m02 * m10 * m21
+            - m00 * m12 * m21
+            - m01 * m10 * m22
+            - m02 * m11 * m20
+    }
+
+    /// Returns the inverse, or `None` when the determinant is exactly zero.
+    ///
+    /// Computed as `inv(A) = 1/det(A) * adj(A)`, following pixman, cairo and
+    /// "Mathematics for Game Programmers". lcms uses Gauss-Jordan elimination with
+    /// partial pivoting, which is less efficient and not as numerically stable
+    /// (see "Mathematics for Game Programmers").
+    pub fn invert(&self) -> Option<Matrix> {
+        // For index k, HI[k] and LO[k] are the two rows (or columns) other than k.
+        const HI: [usize; 3] = [2, 2, 1];
+        const LO: [usize; 3] = [1, 0, 0];
+
+        // An exactly-zero determinant is the only case reported as singular.
+        let det = self.det();
+        if det == 0. {
+            return None;
+        }
+        // The reciprocal is taken in f32 and only then widened to f64.
+        let inv_det = (1. / det) as f64;
+
+        let mut inverse = Matrix { m: [[0.; 3]; 3] };
+        for j in 0..3 {
+            for i in 0..3 {
+                let (ai, aj, bi, bj) = (HI[i], HI[j], LO[i], LO[j]);
+                // The 2x2 minor is evaluated in f32 and only then widened to f64.
+                let minor =
+                    (self.m[ai][aj] * self.m[bi][bj] - self.m[ai][bj] * self.m[bi][aj]) as f64;
+                // Cofactor signs alternate like a checkerboard.
+                let cofactor = if (i + j) % 2 == 1 { -minor } else { minor };
+                // Storing at [j][i] transposes the cofactors into the adjugate.
+                inverse.m[j][i] = (inv_det * cofactor) as f32;
+            }
+        }
+        Some(inverse)
+    }
+
+    /// Returns the 3x3 identity matrix.
+    ///
+    /// Ones on the diagonal, zeros everywhere else.
+    pub fn identity() -> Matrix {
+        Matrix {
+            m: [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+        }
+    }
+
+    /// Always returns `None`.
+    ///
+    /// NOTE(review): presumably a readable spelling for "no matrix" at call sites - confirm.
+    pub fn invalid() -> Option<Matrix> {
+        None
+    }
+
+    /// Returns the matrix product `a * b` (from pixman; cf. "MAT3per...").
+    ///
+    /// Each element is accumulated in `f64` from products computed in `f32`, then
+    /// narrowed back to `f32`. Both arguments are taken by value; `Matrix` is `Copy`.
+    pub fn multiply(a: Matrix, b: Matrix) -> Matrix {
+        let mut product = Matrix { m: [[0.; 3]; 3] };
+        for dy in 0..3 {
+            for dx in 0..3 {
+                // Explicit fold from +0.0 keeps the summation order fixed: o = 0, 1, 2.
+                let sum = (0..3).fold(0f64, |acc, o| acc + (a.m[dy][o] * b.m[o][dx]) as f64);
+                product.m[dy][dx] = sum as f32;
+            }
+        }
+        product
+    }
+}
diff --git a/gfx/qcms/src/transform.rs b/gfx/qcms/src/transform.rs
new file mode 100644
index 0000000000..cfca37be4c
--- /dev/null
+++ b/gfx/qcms/src/transform.rs
@@ -0,0 +1,1571 @@
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#![allow(clippy::missing_safety_doc)]
+#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "neon"))]
+use crate::transform_neon::{
+    qcms_transform_data_bgra_out_lut_neon, qcms_transform_data_rgb_out_lut_neon,
+    qcms_transform_data_rgba_out_lut_neon,
+};
+use crate::{
+    chain::chain_transform,
+    double_to_s15Fixed16Number,
+    iccread::SUPPORTS_ICCV4,
+    matrix::*,
+    transform_util::{
+        build_colorant_matrix, build_input_gamma_table, build_output_lut, compute_precache,
+        lut_interp_linear,
+    },
+};
+use crate::{
+    iccread::{qcms_CIE_xyY, qcms_CIE_xyYTRIPLE, Profile, GRAY_SIGNATURE, RGB_SIGNATURE},
+    transform_util::clamp_float,
+    Intent,
+};
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use crate::{
+    transform_avx::{
+        qcms_transform_data_bgra_out_lut_avx, qcms_transform_data_rgb_out_lut_avx,
+        qcms_transform_data_rgba_out_lut_avx,
+    },
+    transform_sse2::{
+        qcms_transform_data_bgra_out_lut_sse2, qcms_transform_data_rgb_out_lut_sse2,
+        qcms_transform_data_rgba_out_lut_sse2,
+    },
+};
+
+use std::sync::atomic::Ordering;
+use std::sync::Arc;
+#[cfg(all(target_arch = "arm", feature = "neon"))]
+use std::arch::is_arm_feature_detected;
+#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+use std::arch::is_aarch64_feature_detected;
+
+pub const PRECACHE_OUTPUT_SIZE: usize = 8192; // entries per precached output table
+pub const PRECACHE_OUTPUT_MAX: usize = PRECACHE_OUTPUT_SIZE - 1; // largest table index
+pub const FLOATSCALE: f32 = PRECACHE_OUTPUT_SIZE as f32;
+pub const CLAMPMAXVAL: f32 = PRECACHE_OUTPUT_MAX as f32 / FLOATSCALE; // (SIZE - 1) / SIZE
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct PrecacheOuput {
+    /* Precached 8-bit output values, indexed by a value scaled to 0..=PRECACHE_OUTPUT_MAX.
+     * We previously used a count of 65536 here but that seems like more precision than we
+     * actually need.  By reducing the size we can improve startup performance and reduce
+     * memory usage. ColorSync on 10.5 uses 4097 which is perhaps because they use a fixed
+     * point representation where 1. is represented by 0x1000. */
+    pub data: [u8; PRECACHE_OUTPUT_SIZE],
+}
+
+impl Default for PrecacheOuput {
+    /// Returns a table with every entry set to zero.
+    fn default() -> Self {
+        let data = [0; PRECACHE_OUTPUT_SIZE];
+        Self { data }
+    }
+}
+
+/* used as a lookup table for the output transformation.
+ * we refcount them so we only need to have one around per output
+ * profile, instead of duplicating them per transform */
+
+#[repr(C)]
+#[repr(align(16))]
+#[derive(Clone, Default)]
+pub struct qcms_transform {
+    pub matrix: [[f32; 4]; 3], // the precache kernel reads it as [input channel][output channel]
+    pub input_gamma_table_r: Option<Box<[f32; 256]>>, // indexed by the 8-bit device value
+    pub input_gamma_table_g: Option<Box<[f32; 256]>>,
+    pub input_gamma_table_b: Option<Box<[f32; 256]>>,
+    pub input_clut_table_length: u16,
+    pub clut: Option<Vec<f32>>,
+    pub grid_size: u16,
+    pub output_clut_table_length: u16,
+    pub input_gamma_table_gray: Option<Box<[f32; 256]>>, // used by the gray kernels
+    pub out_gamma_r: f32,
+    pub out_gamma_g: f32,
+    pub out_gamma_b: f32,
+    pub out_gamma_gray: f32,
+    pub output_gamma_lut_r: Option<Vec<u16>>, // interpolated by the gray `*_out_lut` kernels
+    pub output_gamma_lut_g: Option<Vec<u16>>,
+    pub output_gamma_lut_b: Option<Vec<u16>>,
+    pub output_gamma_lut_gray: Option<Vec<u16>>,
+    pub output_gamma_lut_r_length: usize,
+    pub output_gamma_lut_g_length: usize,
+    pub output_gamma_lut_b_length: usize,
+    pub output_gamma_lut_gray_length: usize,
+    pub output_table_r: Option<Arc<PrecacheOuput>>, // shared table read by `*_precache` kernels
+    pub output_table_g: Option<Arc<PrecacheOuput>>,
+    pub output_table_b: Option<Arc<PrecacheOuput>>,
+    pub transform_fn: transform_fn_t,
+}
+
+/// Optional pixel kernel, called as `(transform, src, dest, length)`.
+pub type transform_fn_t = Option<unsafe fn(_: &qcms_transform, _: *const u8, _: *mut u8, _: usize)>;
+/// The format of pixel data (which channels one pixel holds, and in which order).
+#[repr(u32)]
+#[derive(PartialEq, Eq, Clone, Copy)]
+#[allow(clippy::upper_case_acronyms)]
+pub enum DataType {
+    RGB8 = 0,
+    RGBA8 = 1,
+    BGRA8 = 2,
+    Gray8 = 3,
+    GrayA8 = 4,
+    CMYK = 5,
+}
+
+impl DataType {
+    /// Number of bytes occupied by one pixel in this format.
+    pub fn bytes_per_pixel(&self) -> usize {
+        // Variants are path-qualified so no arm can turn into a catch-all binding.
+        match self {
+            Self::Gray8 => 1,
+            Self::GrayA8 => 2,
+            Self::RGB8 => 3,
+            Self::RGBA8 | Self::BGRA8 | Self::CMYK => 4,
+        }
+    }
+}
+
+use DataType::*;
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+#[allow(clippy::upper_case_acronyms)]
+pub struct CIE_XYZ {
+    pub X: f64, // the three CIE XYZ components, each stored as f64
+    pub Y: f64,
+    pub Z: f64,
+}
+
+pub trait Format {
+    const kRIndex: usize; // byte offset of red within one pixel
+    const kGIndex: usize; // byte offset of green within one pixel
+    const kBIndex: usize; // byte offset of blue within one pixel
+    const kAIndex: usize; // byte offset of alpha, or 0xFF when the layout has no alpha
+}
+
+#[allow(clippy::upper_case_acronyms)]
+pub struct BGRA;
+impl Format for BGRA {
+    const kBIndex: usize = 0;
+    const kGIndex: usize = 1;
+    const kRIndex: usize = 2;
+    const kAIndex: usize = 3;
+}
+
+#[allow(clippy::upper_case_acronyms)]
+pub struct RGBA;
+impl Format for RGBA {
+    const kRIndex: usize = 0;
+    const kGIndex: usize = 1;
+    const kBIndex: usize = 2;
+    const kAIndex: usize = 3;
+}
+
+#[allow(clippy::upper_case_acronyms)]
+pub struct RGB;
+impl Format for RGB {
+    const kRIndex: usize = 0;
+    const kGIndex: usize = 1;
+    const kBIndex: usize = 2;
+    const kAIndex: usize = 0xFF; // sentinel: no alpha byte, so the kernels step 3 bytes
+}
+
+pub trait GrayFormat {
+    const has_alpha: bool; // true when every gray byte is followed by an alpha byte
+}
+
+pub struct Gray;
+impl GrayFormat for Gray {
+    const has_alpha: bool = false;
+}
+
+pub struct GrayAlpha;
+impl GrayFormat for GrayAlpha {
+    const has_alpha: bool = true;
+}
+
+/// Rounds `v` half-up to an integer and saturates the result to the `u8` range.
+#[inline]
+fn clamp_u8(v: f32) -> u8 {
+    // The range checks come first; NaN fails both and falls through to the cast.
+    match v {
+        hi if hi > 255. => u8::MAX,
+        lo if lo < 0. => u8::MIN,
+        mid => (mid + 0.5).floor() as u8,
+    }
+}
+
+// Build a white point / primary chromaticities transfer matrix from RGB to CIE XYZ.
+// This is just an approximation: not all the non-linear aspects of the RGB to XYZ
+// process are handled, and the gamma correction is assumed to have the transitive
+// property in the transformation chain.
+//
+// `white` is the white point and `primrs` the three primaries, all given as xyY.
+// Returns `None` when `white.y` is zero or when the primaries matrix has a zero
+// determinant.
+//
+// The algorithm:
+//
+//            - First, build the absolute conversion matrix using the primaries in
+//              XYZ. This matrix is then inverted.
+//            - Then, evaluate the source white point across this matrix, obtaining
+//              the coefficients of the transformation.
+//            - Finally, apply these coefficients to the original matrix.
+fn build_RGB_to_XYZ_transfer_matrix(
+    white: qcms_CIE_xyY,
+    primrs: qcms_CIE_xyYTRIPLE,
+) -> Option<Matrix> {
+    let (xn, yn): (f64, f64) = (white.x, white.y);
+    // The white point is divided by its y below, so a zero y cannot be used.
+    if yn == 0.0f64 {
+        return None;
+    }
+
+    // One [x, y, z] triple per primary, with z = 1 - x - y, kept in f64.
+    let xyz = |x: f64, y: f64| [x, y, 1f64 - x - y];
+    let columns = [
+        xyz(primrs.red.x, primrs.red.y),
+        xyz(primrs.green.x, primrs.green.y),
+        xyz(primrs.blue.x, primrs.blue.y),
+    ];
+
+    // Each primary becomes one column of the (f32) primaries matrix.
+    let mut primaries = Matrix { m: [[0.; 3]; 3] };
+    for (col, chroma) in columns.iter().enumerate() {
+        for (row, component) in chroma.iter().enumerate() {
+            primaries.m[row][col] = *component as f32;
+        }
+    }
+
+    // White point scaled so that its middle component is 1: [x/y, 1, (1 - x - y)/y].
+    let white_point = Vector {
+        v: [(xn / yn) as f32, 1., ((1.0f64 - xn - yn) / yn) as f32],
+    };
+    // `invert` yields `None` for a zero determinant, which is propagated to the caller.
+    let coefs = primaries.invert()?.eval(white_point);
+
+    // Scale every column of the original chromaticities by its coefficient; the
+    // product is taken in f64 and narrowed to f32 on store.
+    let mut result = Matrix { m: [[0.; 3]; 3] };
+    for (col, chroma) in columns.iter().enumerate() {
+        for (row, component) in chroma.iter().enumerate() {
+            result.m[row][col] = (coefs.v[col] as f64 * *component) as f32;
+        }
+    }
+    Some(result)
+}
+/// CIE Illuminant D50 as an XYZ triple (Y is 1.0).
+const D50_XYZ: CIE_XYZ = CIE_XYZ {
+    X: 0.9642f64,
+    Y: 1.0000f64,
+    Z: 0.8249f64,
+};
+/// Converts a CIE xyY colour into CIE XYZ.
+///
+/// From lcms: xyY2XYZ(); corresponds to argyll: icmYxy2XYZ().
+/// The conversion divides by `source.y` without checking it; `adapt_matrix_to_D50`
+/// rejects `y == 0` before calling.
+fn xyY2XYZ(source: qcms_CIE_xyY) -> CIE_XYZ {
+    let (x, y, luminance) = (source.x, source.y, source.Y);
+    CIE_XYZ {
+        X: x / y * luminance,
+        Y: luminance,
+        Z: (1f64 - x - y) / y * luminance,
+    }
+}
+/* from lcms: ComputeChromaticAdaption */
+// Compute the chromatic adaptation matrix using `chad` as the cone matrix.
+//
+// Both white points are mapped into cone space with `chad`; the per-channel ratios
+// (destination / source) form a diagonal gain matrix, and the value returned is
+// `chad^-1 * gains * chad`.
+//
+// Returns `None` when `chad` cannot be inverted. The divisions are not guarded, so a
+// zero source cone response produces inf/NaN entries rather than `None`.
+fn compute_chromatic_adaption(
+    source_white_point: CIE_XYZ,
+    dest_white_point: CIE_XYZ,
+    chad: Matrix,
+) -> Option<Matrix> {
+    // Without an inverse of the cone matrix there is nothing to map back with.
+    let chad_inv = chad.invert()?;
+
+    // XYZ is narrowed to f32 before being pushed through the cone matrix.
+    let to_cone = |white: CIE_XYZ| {
+        chad.eval(Vector {
+            v: [white.X as f32, white.Y as f32, white.Z as f32],
+        })
+    };
+    let cone_source_rgb = to_cone(source_white_point);
+    let cone_dest_rgb = to_cone(dest_white_point);
+
+    // Diagonal gain matrix: destination response over source response, per channel.
+    let mut cone = Matrix { m: [[0.; 3]; 3] };
+    for k in 0..3 {
+        cone.m[k][k] = cone_dest_rgb.v[k] / cone_source_rgb.v[k];
+    }
+
+    // Normalize
+    Some(Matrix::multiply(chad_inv, Matrix::multiply(cone, chad)))
+}
+/* from lcms: cmsAdaptionMatrix */
+// Returns the final chromatic adaptation from illuminant `source_illumination`
+// to illuminant `target_illumination`, or `None` if it cannot be computed.
+// Bradford is assumed.
+fn adaption_matrix(source_illumination: CIE_XYZ, target_illumination: CIE_XYZ) -> Option<Matrix> {
+    // Cone matrix handed to `compute_chromatic_adaption` (the Bradford matrix).
+    const LAM_RIGG: Matrix = Matrix {
+        m: [
+            [0.8951, 0.2664, -0.1614],
+            [-0.7502, 1.7135, 0.0367],
+            [0.0389, -0.0685, 1.0296],
+        ],
+    };
+    compute_chromatic_adaption(source_illumination, target_illumination, LAM_RIGG)
+}
+/* from lcms: cmsAdaptMatrixToD50 */
+// Left-multiplies `r` by the adaptation from `source_white_pt` to D50. Returns `None`
+// when `r` is `None`, when `source_white_pt.y` is zero, or when the adaptation fails.
+fn adapt_matrix_to_D50(r: Option<Matrix>, source_white_pt: qcms_CIE_xyY) -> Option<Matrix> {
+    if source_white_pt.y == 0.0f64 {
+        return None;
+    }
+    let bradford = adaption_matrix(xyY2XYZ(source_white_pt), D50_XYZ)?;
+    r.map(|colorants| Matrix::multiply(bradford, colorants))
+}
+/// Stores the D50-adapted RGB colorants in `profile`; returns `false` (no change) on failure.
+pub(crate) fn set_rgb_colorants(
+    profile: &mut Profile,
+    white_point: qcms_CIE_xyY,
+    primaries: qcms_CIE_xyYTRIPLE,
+) -> bool {
+    let fixed = |value: f32| double_to_s15Fixed16Number(value as f64);
+    if let Some(Matrix { m }) = get_rgb_colorants(white_point, primaries) {
+        /* note: there's a transpose type of operation going on here */
+        profile.redColorant.X = fixed(m[0][0]);
+        profile.redColorant.Y = fixed(m[1][0]);
+        profile.redColorant.Z = fixed(m[2][0]);
+        profile.greenColorant.X = fixed(m[0][1]);
+        profile.greenColorant.Y = fixed(m[1][1]);
+        profile.greenColorant.Z = fixed(m[2][1]);
+        profile.blueColorant.X = fixed(m[0][2]);
+        profile.blueColorant.Y = fixed(m[1][2]);
+        profile.blueColorant.Z = fixed(m[2][2]);
+        true
+    } else {
+        false
+    }
+}
+/// Builds the RGB-to-XYZ colorant matrix for the given primaries, adapted to D50.
+pub(crate) fn get_rgb_colorants(
+    white_point: qcms_CIE_xyY,
+    primaries: qcms_CIE_xyYTRIPLE,
+) -> Option<Matrix> {
+    adapt_matrix_to_D50(build_RGB_to_XYZ_transfer_matrix(white_point, primaries), white_point)
+}
+/* Alpha is not corrected.
+   A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
+   RGB Is?" Tech Memo 17 (December 14, 1998).
+    See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
+*/
+// Converts `length` gray pixels (layout `I`) to RGB-family pixels (layout `F`) by
+// linearising through the gray input gamma table and interpolating each output gamma LUT.
+//
+// Safety: `src` must be readable for `length` input pixels (1 byte each, 2 with alpha)
+// and `dest` writable for `length` output pixels (3 bytes each, 4 with alpha).
+// Panics if the gray input table is `None`, or (for `length > 0`) if an output LUT is.
+unsafe extern "C" fn qcms_transform_data_gray_template_lut<I: GrayFormat, F: Format>(
+    transform: &qcms_transform,
+    mut src: *const u8,
+    mut dest: *mut u8,
+    length: usize,
+) {
+    let out_stride: usize = if F::kAIndex == 0xff { 3 } else { 4 };
+    let input_gamma_table_gray = transform.input_gamma_table_gray.as_ref().unwrap();
+
+    // Iterate over `usize` directly: a `u32` counter cannot represent every `length`.
+    for _ in 0..length {
+        let device: u8 = *src;
+        src = src.add(1);
+        let alpha: u8 = if I::has_alpha {
+            let value = *src;
+            src = src.add(1);
+            value
+        } else {
+            0xff
+        };
+        let linear = input_gamma_table_gray[device as usize] as f64;
+
+        // The LUTs are unwrapped per pixel, so an empty run never touches them.
+        let channel = |lut: &Option<Vec<u16>>| {
+            clamp_u8(lut_interp_linear(linear, &lut.as_ref().unwrap()) * 255f32)
+        };
+        let out_r = channel(&transform.output_gamma_lut_r);
+        let out_g = channel(&transform.output_gamma_lut_g);
+        let out_b = channel(&transform.output_gamma_lut_b);
+        *dest.add(F::kRIndex) = out_r;
+        *dest.add(F::kGIndex) = out_g;
+        *dest.add(F::kBIndex) = out_b;
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = alpha;
+        }
+        dest = dest.add(out_stride);
+    }
+}
+unsafe fn qcms_transform_data_gray_out_lut(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 1 byte each (gray)
+    dest: *mut u8, // `length` pixels, 3 bytes each (R, G, B)
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_lut::<Gray, RGB>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_gray_rgba_out_lut(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 1 byte each (gray)
+    dest: *mut u8, // `length` pixels, 4 bytes each (R, G, B, A); alpha is written as 0xff
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_lut::<Gray, RGBA>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_gray_bgra_out_lut(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 1 byte each (gray)
+    dest: *mut u8, // `length` pixels, 4 bytes each (B, G, R, A); alpha is written as 0xff
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_lut::<Gray, BGRA>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_graya_rgba_out_lut(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 2 bytes each (gray, alpha)
+    dest: *mut u8, // `length` pixels, 4 bytes each (R, G, B, A); alpha is copied from `src`
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_lut::<GrayAlpha, RGBA>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_graya_bgra_out_lut(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 2 bytes each (gray, alpha)
+    dest: *mut u8, // `length` pixels, 4 bytes each (B, G, R, A); alpha is copied from `src`
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_lut::<GrayAlpha, BGRA>(transform, src, dest, length);
+}
+// Converts `length` gray pixels (layout `I`) to RGB-family pixels (layout `F`) using
+// the precached 8-bit output tables instead of interpolating the output gamma LUTs.
+//
+// Safety: `transform` must point to a valid `qcms_transform`; `src` must be readable for
+// `length` input pixels and `dest` writable for `length` output pixels.
+// Panics if any precached output table or the gray input gamma table is `None`.
+unsafe fn qcms_transform_data_gray_template_precache<I: GrayFormat, F: Format>(
+    transform: *const qcms_transform,
+    mut src: *const u8,
+    mut dest: *mut u8,
+    length: usize,
+) {
+    let out_stride: usize = if F::kAIndex == 0xff { 3 } else { 4 };
+    let transform = &*transform;
+    let output_table_r = transform.output_table_r.as_deref().unwrap();
+    let output_table_g = transform.output_table_g.as_deref().unwrap();
+    let output_table_b = transform.output_table_b.as_deref().unwrap();
+    let input_gamma_table_gray = transform.input_gamma_table_gray.as_ref().unwrap();
+
+    // Iterate over `usize` directly: a `u32` counter cannot represent every `length`.
+    for _ in 0..length {
+        let device: u8 = *src;
+        src = src.add(1);
+        let alpha: u8 = if I::has_alpha {
+            let value = *src;
+            src = src.add(1);
+            value
+        } else {
+            0xff
+        };
+        let linear: f32 = input_gamma_table_gray[device as usize];
+        /* we could round here... */
+        let gray = (linear * PRECACHE_OUTPUT_MAX as f32) as u16 as usize;
+        *dest.add(F::kRIndex) = output_table_r.data[gray];
+        *dest.add(F::kGIndex) = output_table_g.data[gray];
+        *dest.add(F::kBIndex) = output_table_b.data[gray];
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = alpha;
+        }
+        dest = dest.add(out_stride);
+    }
+}
+unsafe fn qcms_transform_data_gray_out_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 1 byte each (gray)
+    dest: *mut u8, // `length` pixels, 3 bytes each (R, G, B)
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_precache::<Gray, RGB>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_gray_rgba_out_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 1 byte each (gray)
+    dest: *mut u8, // `length` pixels, 4 bytes each (R, G, B, A); alpha is written as 0xff
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_precache::<Gray, RGBA>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_gray_bgra_out_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 1 byte each (gray)
+    dest: *mut u8, // `length` pixels, 4 bytes each (B, G, R, A); alpha is written as 0xff
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_precache::<Gray, BGRA>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_graya_rgba_out_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 2 bytes each (gray, alpha)
+    dest: *mut u8, // `length` pixels, 4 bytes each (R, G, B, A); alpha is copied from `src`
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_precache::<GrayAlpha, RGBA>(transform, src, dest, length);
+}
+unsafe fn qcms_transform_data_graya_bgra_out_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 2 bytes each (gray, alpha)
+    dest: *mut u8, // `length` pixels, 4 bytes each (B, G, R, A); alpha is copied from `src`
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_gray_template_precache::<GrayAlpha, BGRA>(transform, src, dest, length);
+}
+// Converts `length` RGB-family pixels (layout `F` for both input and output): each
+// channel is linearised through its input gamma table, mixed through `transform.matrix`,
+// clamped, and mapped to 8 bits through the precached output tables.
+//
+// Safety: `src` must be readable and `dest` writable for `length` pixels of layout `F`.
+// Panics if any precached output table or input gamma table is `None`.
+unsafe fn qcms_transform_data_template_lut_precache<F: Format>(
+    transform: &qcms_transform,
+    mut src: *const u8,
+    mut dest: *mut u8,
+    length: usize,
+) {
+    let stride: usize = if F::kAIndex == 0xff { 3 } else { 4 };
+    let output_table_r = transform.output_table_r.as_deref().unwrap();
+    let output_table_g = transform.output_table_g.as_deref().unwrap();
+    let output_table_b = transform.output_table_b.as_deref().unwrap();
+    let input_gamma_table_r = transform.input_gamma_table_r.as_ref().unwrap();
+    let input_gamma_table_g = transform.input_gamma_table_g.as_ref().unwrap();
+    let input_gamma_table_b = transform.input_gamma_table_b.as_ref().unwrap();
+    let mat = &transform.matrix;
+
+    // Iterate over `usize` directly: a `u32` counter cannot represent every `length`.
+    for _ in 0..length {
+        let linear_r: f32 = input_gamma_table_r[*src.add(F::kRIndex) as usize];
+        let linear_g: f32 = input_gamma_table_g[*src.add(F::kGIndex) as usize];
+        let linear_b: f32 = input_gamma_table_b[*src.add(F::kBIndex) as usize];
+        // Alpha is passed through untouched; the value is unused for alpha-less layouts.
+        let alpha: u8 = if F::kAIndex != 0xff {
+            *src.add(F::kAIndex)
+        } else {
+            0
+        };
+        src = src.add(stride);
+
+        // Column `c` of `mat` produces output channel `c`; the result is clamped and
+        // scaled to a table index (truncating: "we could round here...").
+        let index = |c: usize| -> usize {
+            let mixed = mat[0][c] * linear_r + mat[1][c] * linear_g + mat[2][c] * linear_b;
+            (clamp_float(mixed) * PRECACHE_OUTPUT_MAX as f32) as u16 as usize
+        };
+        let (r, g, b) = (index(0), index(1), index(2));
+        *dest.add(F::kRIndex) = output_table_r.data[r];
+        *dest.add(F::kGIndex) = output_table_g.data[g];
+        *dest.add(F::kBIndex) = output_table_b.data[b];
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = alpha;
+        }
+        dest = dest.add(stride);
+    }
+}
+#[no_mangle]
+pub unsafe fn qcms_transform_data_rgb_out_lut_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 3 bytes each (R, G, B)
+    dest: *mut u8, // `length` pixels, same 3-byte layout as `src`
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_template_lut_precache::<RGB>(transform, src, dest, length);
+}
+#[no_mangle]
+pub unsafe fn qcms_transform_data_rgba_out_lut_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 4 bytes each (R, G, B, A)
+    dest: *mut u8, // `length` pixels, same layout as `src`; alpha is copied unchanged
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_template_lut_precache::<RGBA>(transform, src, dest, length);
+}
+#[no_mangle]
+pub unsafe fn qcms_transform_data_bgra_out_lut_precache(
+    transform: &qcms_transform,
+    src: *const u8, // `length` pixels, 4 bytes each (B, G, R, A)
+    dest: *mut u8, // `length` pixels, same layout as `src`; alpha is copied unchanged
+    length: usize, // pixel count, not byte count
+) {
+    qcms_transform_data_template_lut_precache::<BGRA>(transform, src, dest, length);
+}
+// Not used
+/*
+static void qcms_transform_data_clut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
+    unsigned int i;
+    int xy_len = 1;
+    int x_len = transform->grid_size;
+    int len = x_len * x_len;
+    const float* r_table = transform->r_clut;
+    const float* g_table = transform->g_clut;
+    const float* b_table = transform->b_clut;
+
+    for (i = 0; i < length; i++) {
+        unsigned char in_r = *src++;
+        unsigned char in_g = *src++;
+        unsigned char in_b = *src++;
+        float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
+
+        int x = floorf(linear_r * (transform->grid_size-1));
+        int y = floorf(linear_g * (transform->grid_size-1));
+        int z = floorf(linear_b * (transform->grid_size-1));
+        int x_n = ceilf(linear_r * (transform->grid_size-1));
+        int y_n = ceilf(linear_g * (transform->grid_size-1));
+        int z_n = ceilf(linear_b * (transform->grid_size-1));
+        float x_d = linear_r * (transform->grid_size-1) - x;
+        float y_d = linear_g * (transform->grid_size-1) - y;
+        float z_d = linear_b * (transform->grid_size-1) - z;
+
+        float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
+        float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
+        float r_y1 = lerp(r_x1, r_x2, y_d);
+        float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
+        float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
+        float r_y2 = lerp(r_x3, r_x4, y_d);
+        float clut_r = lerp(r_y1, r_y2, z_d);
+
+        float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
+        float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
+        float g_y1 = lerp(g_x1, g_x2, y_d);
+        float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
+        float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
+        float g_y2 = lerp(g_x3, g_x4, y_d);
+        float clut_g = lerp(g_y1, g_y2, z_d);
+
+        float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
+        float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
+        float b_y1 = lerp(b_x1, b_x2, y_d);
+        float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
+        float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
+        float b_y2 = lerp(b_x3, b_x4, y_d);
+        float clut_b = lerp(b_y1, b_y2, z_d);
+
+        *dest++ = clamp_u8(clut_r*255.0f);
+        *dest++ = clamp_u8(clut_g*255.0f);
+        *dest++ = clamp_u8(clut_b*255.0f);
+    }
+}
+*/
+/// Divides `value` by `div`, rounding the quotient up.
+///
+/// Evaluated as `(value + div - 1) / div` with truncating integer division, so the
+/// result is the mathematical ceiling only for non-negative `value` and positive `div`.
+fn int_div_ceil(value: i32, div: i32) -> i32 {
+    let biased = value + div - 1;
+    biased / div
+}
+// Using lcms' tetra interpolation algorithm.
+/// Runs `length` pixels from `src` through the transform's 3D colour lookup table
+/// (`clut`) with tetrahedral interpolation and writes the result to `dest`.
+///
+/// `F` selects the pixel layout: `F::kRIndex`/`kGIndex`/`kBIndex` are the byte
+/// positions of the colour channels inside a pixel, and `F::kAIndex` is the position
+/// of the alpha byte, or `0xff` when the format has no alpha (3 bytes per pixel
+/// instead of 4). Alpha, when present, is copied through unchanged.
+///
+/// The per-pixel interpolation is delegated to [`tetra`], which performs exactly the
+/// arithmetic this function used to carry as a duplicated copy.
+///
+/// # Safety
+/// `transform` must point to a valid `qcms_transform`, and `src` / `dest` must be
+/// valid for reading / writing `length` pixels of the layout described by `F`.
+///
+/// # Panics
+/// Panics if `clut` is `None` or holds fewer than `grid_size^3 * 3` entries.
+unsafe extern "C" fn qcms_transform_data_tetra_clut_template<F: Format>(
+    transform: *const qcms_transform,
+    mut src: *const u8,
+    mut dest: *mut u8,
+    length: usize,
+) {
+    let components: usize = if F::kAIndex == 0xff { 3 } else { 4 };
+
+    let transform = &*transform;
+    let clut = transform.clut.as_ref().unwrap();
+    // Guard the raw-pointer reads below, mirroring the check done by the CMYK variant.
+    let grid_size = transform.grid_size as usize;
+    assert!(clut.len() >= grid_size * grid_size * grid_size * 3);
+    let table = clut.as_ptr();
+
+    // Iterating over `0..length` (a `usize` range) cannot overflow, unlike a `u32` counter.
+    for _ in 0..length {
+        let in_r: u8 = *src.add(F::kRIndex);
+        let in_g: u8 = *src.add(F::kGIndex);
+        let in_b: u8 = *src.add(F::kBIndex);
+        let in_a: u8 = if F::kAIndex != 0xff {
+            *src.add(F::kAIndex)
+        } else {
+            0
+        };
+        src = src.add(components);
+
+        let (clut_r, clut_g, clut_b) = tetra(transform, table, in_r, in_g, in_b);
+
+        *dest.add(F::kRIndex) = clamp_u8(clut_r * 255.0);
+        *dest.add(F::kGIndex) = clamp_u8(clut_g * 255.0);
+        *dest.add(F::kBIndex) = clamp_u8(clut_b * 255.0);
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = in_a;
+        }
+        dest = dest.add(components);
+    }
+}
+
+/// Tetrahedrally interpolates one RGB sample in a 3D colour lookup table.
+///
+/// `table` points at interleaved `[r, g, b]` entries indexed as
+/// `((x * grid_size + y) * grid_size + z) * 3`, where `x`, `y`, `z` are the grid
+/// positions derived from `in_r`, `in_g`, `in_b`. Each 8-bit input is scaled onto the
+/// grid; the fractional parts select one of the six tetrahedra of the enclosing cell
+/// and weight the differences between its corners.
+///
+/// Returns the interpolated `(r, g, b)` table values; no clamping is applied here.
+///
+/// # Safety
+/// `table` must be valid for reads of `transform.grid_size^3 * 3` `f32` values.
+unsafe fn tetra(
+    transform: &qcms_transform,
+    table: *const f32,
+    in_r: u8,
+    in_g: u8,
+    in_b: u8,
+) -> (f32, f32, f32) {
+    let grid: i32 = transform.grid_size as i32;
+    let max_index: i32 = grid - 1;
+    let plane: i32 = grid * grid;
+
+    // For one channel: lower grid index, upper grid index, fractional position between them.
+    let locate = |value: u8| -> (i32, i32, f32) {
+        let scaled = value as i32 * max_index;
+        let lower = scaled / 255;
+        let upper = int_div_ceil(scaled, 255);
+        let fraction = value as i32 as f32 / 255.0 * max_index as f32 - lower as f32;
+        (lower, upper, fraction)
+    };
+    let (x, x_n, rx) = locate(in_r);
+    let (y, y_n, ry) = locate(in_g);
+    let (z, z_n, rz) = locate(in_b);
+
+    // Reads the interleaved [r, g, b] triple stored at one grid vertex.
+    let node = |gx: i32, gy: i32, gz: i32| -> [f32; 3] {
+        let entry = table.offset(((gx * plane + gy * grid + gz) * 3) as isize);
+        [*entry, *entry.add(1), *entry.add(2)]
+    };
+    // Channel-wise difference between two vertices.
+    let delta = |a: [f32; 3], b: [f32; 3]| -> [f32; 3] { [a[0] - b[0], a[1] - b[1], a[2] - b[2]] };
+
+    let c0 = node(x, y, z);
+    let (c1, c2, c3) = if rx >= ry {
+        if ry >= rz {
+            // rx >= ry >= rz
+            (
+                delta(node(x_n, y, z), c0),
+                delta(node(x_n, y_n, z), node(x_n, y, z)),
+                delta(node(x_n, y_n, z_n), node(x_n, y_n, z)),
+            )
+        } else if rx >= rz {
+            // rx >= rz > ry
+            (
+                delta(node(x_n, y, z), c0),
+                delta(node(x_n, y_n, z_n), node(x_n, y, z_n)),
+                delta(node(x_n, y, z_n), node(x_n, y, z)),
+            )
+        } else {
+            // rz > rx >= ry
+            (
+                delta(node(x_n, y, z_n), node(x, y, z_n)),
+                delta(node(x_n, y_n, z_n), node(x_n, y, z_n)),
+                delta(node(x, y, z_n), c0),
+            )
+        }
+    } else if rx >= rz {
+        // ry > rx >= rz
+        (
+            delta(node(x_n, y_n, z), node(x, y_n, z)),
+            delta(node(x, y_n, z), c0),
+            delta(node(x_n, y_n, z_n), node(x_n, y_n, z)),
+        )
+    } else if ry >= rz {
+        // ry >= rz > rx
+        (
+            delta(node(x_n, y_n, z_n), node(x, y_n, z_n)),
+            delta(node(x, y_n, z), c0),
+            delta(node(x, y_n, z_n), node(x, y_n, z)),
+        )
+    } else {
+        // rz > ry > rx
+        (
+            delta(node(x_n, y_n, z_n), node(x, y_n, z_n)),
+            delta(node(x, y_n, z_n), node(x, y, z_n)),
+            delta(node(x, y, z_n), c0),
+        )
+    };
+
+    (
+        c0[0] + c1[0] * rx + c2[0] * ry + c3[0] * rz,
+        c0[1] + c1[1] * rx + c2[1] * ry + c3[1] * rz,
+        c0[2] + c1[2] * rx + c2[2] * ry + c3[2] * rz,
+    )
+}
+
+/// Linear interpolation: returns `a` at `t == 0.0` and `b` at `t == 1.0`.
+#[inline]
+fn lerp(a: f32, b: f32, t: f32) -> f32 {
+    let weight_a = 1.0 - t;
+    a * weight_a + b * t
+}
+
+// lerp between two tetrahedral interpolations
+// See lcms:Eval4InputsFloat
+/// Converts `length` 4-byte input pixels (read as c, m, y, k) into 3-byte output pixels.
+///
+/// The CLUT is treated as a stack of 3D tables, one per grid step of the fourth input
+/// byte (`k`). The first three bytes are interpolated with [`tetra`] in the two tables
+/// bracketing `k`, and the two results are blended with [`lerp`].
+///
+/// # Safety
+/// `src` must be valid for reading `length * 4` bytes and `dest` for writing
+/// `length * 3` bytes.
+///
+/// # Panics
+/// Panics if `clut` is `None` or shorter than `grid_size^4 * 3` entries.
+#[allow(clippy::many_single_char_names)]
+unsafe fn qcms_transform_data_tetra_clut_cmyk(
+    transform: &qcms_transform,
+    mut src: *const u8,
+    mut dest: *mut u8,
+    length: usize,
+) {
+    let table = transform.clut.as_ref().unwrap().as_ptr();
+    assert!(
+        transform.clut.as_ref().unwrap().len()
+            >= ((transform.grid_size as i32).pow(4) * 3) as usize
+    );
+    let grid = transform.grid_size as i32;
+    let max_index = grid - 1;
+
+    for _ in 0..length {
+        let [c, m, y, k]: [u8; 4] = [*src, *src.add(1), *src.add(2), *src.add(3)];
+        src = src.add(4);
+
+        // Position of `k` on the fourth grid axis: lower slice, upper slice, blend factor.
+        let scaled_k = k as i32 * max_index;
+        let lower = scaled_k / 255;
+        let upper = int_div_ceil(scaled_k, 255);
+        let blend: f32 = k as i32 as f32 / 255.0 * max_index as f32 - lower as f32;
+
+        let lower_table = table.offset((lower * grid * grid * grid * 3) as isize);
+        let upper_table = table.offset((upper * grid * grid * grid * 3) as isize);
+
+        let (r_lo, g_lo, b_lo) = tetra(transform, lower_table, c, m, y);
+        let (r_hi, g_hi, b_hi) = tetra(transform, upper_table, c, m, y);
+
+        *dest = clamp_u8(lerp(r_lo, r_hi, blend) * 255.0);
+        *dest.add(1) = clamp_u8(lerp(g_lo, g_hi, blend) * 255.0);
+        *dest.add(2) = clamp_u8(lerp(b_lo, b_hi, blend) * 255.0);
+        dest = dest.add(3);
+    }
+}
+
+/// Tetrahedral CLUT transform instantiated for the `RGB` pixel format.
+///
+/// # Safety
+/// Same requirements as `qcms_transform_data_tetra_clut_template`: `src` and `dest`
+/// must be valid for `length` pixels of this format.
+unsafe fn qcms_transform_data_tetra_clut_rgb(
+    transform: &qcms_transform,
+    src: *const u8,
+    dest: *mut u8,
+    length: usize,
+) {
+    let transform: *const qcms_transform = transform;
+    qcms_transform_data_tetra_clut_template::<RGB>(transform, src, dest, length)
+}
+/// Tetrahedral CLUT transform instantiated for the `RGBA` pixel format.
+///
+/// # Safety
+/// Same requirements as `qcms_transform_data_tetra_clut_template`: `src` and `dest`
+/// must be valid for `length` pixels of this format.
+unsafe fn qcms_transform_data_tetra_clut_rgba(
+    transform: &qcms_transform,
+    src: *const u8,
+    dest: *mut u8,
+    length: usize,
+) {
+    let transform: *const qcms_transform = transform;
+    qcms_transform_data_tetra_clut_template::<RGBA>(transform, src, dest, length)
+}
+/// Tetrahedral CLUT transform instantiated for the `BGRA` pixel format.
+///
+/// # Safety
+/// Same requirements as `qcms_transform_data_tetra_clut_template`: `src` and `dest`
+/// must be valid for `length` pixels of this format.
+unsafe fn qcms_transform_data_tetra_clut_bgra(
+    transform: &qcms_transform,
+    src: *const u8,
+    dest: *mut u8,
+    length: usize,
+) {
+    let transform: *const qcms_transform = transform;
+    qcms_transform_data_tetra_clut_template::<BGRA>(transform, src, dest, length)
+}
+/// Matrix/TRC transform that interpolates the output gamma LUTs for every pixel.
+///
+/// For each of the `length` pixels: the device values are linearised through the
+/// per-channel input gamma tables, multiplied by `transform.matrix`, clamped with
+/// `clamp_float`, mapped through the output gamma LUTs with `lut_interp_linear`, and
+/// written back as 8-bit values. `F` supplies the byte positions of the channels;
+/// `F::kAIndex == 0xff` means the format has no alpha and uses 3 bytes per pixel,
+/// otherwise pixels are 4 bytes and alpha is copied through unchanged.
+///
+/// # Safety
+/// `src` and `dest` must be valid for reading / writing `length` pixels of the layout
+/// described by `F`.
+///
+/// # Panics
+/// Panics if any input gamma table or output gamma LUT of `transform` is `None`.
+unsafe fn qcms_transform_data_template_lut<F: Format>(
+    transform: &qcms_transform,
+    mut src: *const u8,
+    mut dest: *mut u8,
+    length: usize,
+) {
+    let components: usize = if F::kAIndex == 0xff { 3 } else { 4 };
+
+    let mat = &transform.matrix;
+    // All tables are loop-invariant: resolve the `Option`s once instead of per pixel.
+    let input_gamma_table_r = transform.input_gamma_table_r.as_ref().unwrap();
+    let input_gamma_table_g = transform.input_gamma_table_g.as_ref().unwrap();
+    let input_gamma_table_b = transform.input_gamma_table_b.as_ref().unwrap();
+    let output_gamma_lut_r = transform.output_gamma_lut_r.as_ref().unwrap();
+    let output_gamma_lut_g = transform.output_gamma_lut_g.as_ref().unwrap();
+    let output_gamma_lut_b = transform.output_gamma_lut_b.as_ref().unwrap();
+
+    // Iterating over `0..length` (a `usize` range) cannot overflow, unlike a `u32` counter.
+    for _ in 0..length {
+        let device_r: u8 = *src.add(F::kRIndex);
+        let device_g: u8 = *src.add(F::kGIndex);
+        let device_b: u8 = *src.add(F::kBIndex);
+        let alpha: u8 = if F::kAIndex != 0xff {
+            *src.add(F::kAIndex)
+        } else {
+            0
+        };
+        src = src.add(components);
+
+        let linear_r: f32 = input_gamma_table_r[device_r as usize];
+        let linear_g: f32 = input_gamma_table_g[device_g as usize];
+        let linear_b: f32 = input_gamma_table_b[device_b as usize];
+
+        // `mat[i][j]` multiplies input channel `i` into output channel `j`.
+        let out_linear_r =
+            clamp_float(mat[0][0] * linear_r + mat[1][0] * linear_g + mat[2][0] * linear_b);
+        let out_linear_g =
+            clamp_float(mat[0][1] * linear_r + mat[1][1] * linear_g + mat[2][1] * linear_b);
+        let out_linear_b =
+            clamp_float(mat[0][2] * linear_r + mat[1][2] * linear_g + mat[2][2] * linear_b);
+
+        let out_device_r: f32 = lut_interp_linear(out_linear_r as f64, output_gamma_lut_r);
+        let out_device_g: f32 = lut_interp_linear(out_linear_g as f64, output_gamma_lut_g);
+        let out_device_b: f32 = lut_interp_linear(out_linear_b as f64, output_gamma_lut_b);
+
+        *dest.add(F::kRIndex) = clamp_u8(out_device_r * 255f32);
+        *dest.add(F::kGIndex) = clamp_u8(out_device_g * 255f32);
+        *dest.add(F::kBIndex) = clamp_u8(out_device_b * 255f32);
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = alpha;
+        }
+        dest = dest.add(components);
+    }
+}
+/// Matrix/TRC transform with interpolated output LUTs, instantiated for the `RGB`
+/// pixel format.
+///
+/// # Safety
+/// `src` and `dest` must be valid for reading / writing `length` pixels of this format.
+#[no_mangle]
+pub unsafe fn qcms_transform_data_rgb_out_lut(
+    transform: &qcms_transform,
+    src: *const u8,
+    dest: *mut u8,
+    length: usize,
+) {
+    qcms_transform_data_template_lut::<RGB>(transform, src, dest, length)
+}
+/// Matrix/TRC transform with interpolated output LUTs, instantiated for the `RGBA`
+/// pixel format.
+///
+/// # Safety
+/// `src` and `dest` must be valid for reading / writing `length` pixels of this format.
+#[no_mangle]
+pub unsafe fn qcms_transform_data_rgba_out_lut(
+    transform: &qcms_transform,
+    src: *const u8,
+    dest: *mut u8,
+    length: usize,
+) {
+    qcms_transform_data_template_lut::<RGBA>(transform, src, dest, length)
+}
+/// Matrix/TRC transform with interpolated output LUTs, instantiated for the `BGRA`
+/// pixel format.
+///
+/// # Safety
+/// `src` and `dest` must be valid for reading / writing `length` pixels of this format.
+#[no_mangle]
+pub unsafe fn qcms_transform_data_bgra_out_lut(
+    transform: &qcms_transform,
+    src: *const u8,
+    dest: *mut u8,
+    length: usize,
+) {
+    qcms_transform_data_template_lut::<BGRA>(transform, src, dest, length)
+}
+
+/// Allocates a default-initialised, reference-counted output precache table.
+fn precache_create() -> Arc<PrecacheOuput> {
+    Arc::default()
+}
+
+/// Destroys a transform and frees everything it owns.
+///
+/// Passing a null pointer is a no-op.
+///
+/// # Safety
+/// A non-null `t` must have been produced by `Box::into_raw` on a
+/// `Box<qcms_transform>` and must not be used again after this call.
+#[no_mangle]
+pub unsafe extern "C" fn qcms_transform_release(t: *mut qcms_transform) {
+    // `Box::from_raw` on a null pointer is undefined behaviour, so tolerate null here.
+    if t.is_null() {
+        return;
+    }
+    drop(Box::from_raw(t));
+}
+
+const bradford_matrix: Matrix = Matrix { // Bradford matrix used by compute_whitepoint_adaption (see ICCv4 E.3).
+    m: [
+        [0.8951, 0.2664, -0.1614],
+        [-0.7502, 1.7135, 0.0367],
+        [0.0389, -0.0685, 1.0296],
+    ],
+};
+
+const bradford_matrix_inv: Matrix = Matrix { // Precomputed inverse of bradford_matrix; the outer factor in compute_whitepoint_adaption.
+    m: [
+        [0.9869929, -0.1470543, 0.1599627],
+        [0.4323053, 0.5183603, 0.0492912],
+        [-0.0085287, 0.0400428, 0.9684867],
+    ],
+};
+
+// See ICCv4 E.3
+/// Builds the chromatic adaptation matrix that maps colours relative to the white
+/// point `(X, Y, Z)` onto the fixed white point `(0.96422, 1.000, 0.82521)`.
+///
+/// Both white points are projected through `bradford_matrix`; the per-column ratios
+/// form a diagonal scaling matrix, which is then combined as
+/// `bradford_matrix_inv * (scaling * bradford_matrix)` using `Matrix::multiply`.
+fn compute_whitepoint_adaption(X: f32, Y: f32, Z: f32) -> Matrix {
+    // Ratio of the target white to the source white along one Bradford column.
+    let gain = |col: usize| -> f32 {
+        let target = 0.96422 * bradford_matrix.m[0][col]
+            + 1.000 * bradford_matrix.m[1][col]
+            + 0.82521 * bradford_matrix.m[2][col];
+        let source = X * bradford_matrix.m[0][col]
+            + Y * bradford_matrix.m[1][col]
+            + Z * bradford_matrix.m[2][col];
+        target / source
+    };
+
+    let scaling = Matrix {
+        m: [[gain(0), 0., 0.], [0., gain(1), 0.], [0., 0., gain(2)]],
+    };
+    let scaled_bradford = Matrix::multiply(scaling, bradford_matrix);
+    Matrix::multiply(bradford_matrix_inv, scaled_bradford)
+}
+/// Fills in the per-channel output precache tables of an RGB profile.
+///
+/// Nothing is done when the profile is not RGB, when ICCv4 support is enabled and the
+/// profile carries a `B2A0` or `mBA` LUT, or when any of the three TRC curves is
+/// missing. Tables that already exist are left alone; a newly computed table is only
+/// stored when `compute_precache` reports success.
+#[no_mangle]
+pub extern "C" fn qcms_profile_precache_output_transform(mut profile: &mut Profile) {
+    /* we only support precaching on rgb profiles */
+    if profile.color_space != RGB_SIGNATURE {
+        return;
+    }
+    /* don't precache since we will use the B2A or mBA LUT */
+    if SUPPORTS_ICCV4.load(Ordering::Relaxed) && (profile.B2A0.is_some() || profile.mBA.is_some())
+    {
+        return;
+    }
+    /* don't precache if we do not have the TRC curves */
+    let (red_trc, green_trc, blue_trc) = match (
+        profile.redTRC.as_deref(),
+        profile.greenTRC.as_deref(),
+        profile.blueTRC.as_deref(),
+    ) {
+        (Some(red), Some(green), Some(blue)) => (red, green, blue),
+        _ => return,
+    };
+
+    if profile.output_table_r.is_none() {
+        let mut table = precache_create();
+        let filled = compute_precache(red_trc, &mut Arc::get_mut(&mut table).unwrap().data);
+        if filled {
+            profile.output_table_r = Some(table);
+        }
+    }
+    if profile.output_table_g.is_none() {
+        let mut table = precache_create();
+        let filled = compute_precache(green_trc, &mut Arc::get_mut(&mut table).unwrap().data);
+        if filled {
+            profile.output_table_g = Some(table);
+        }
+    }
+    if profile.output_table_b.is_none() {
+        let mut table = precache_create();
+        let filled = compute_precache(blue_trc, &mut Arc::get_mut(&mut table).unwrap().data);
+        if filled {
+            profile.output_table_b = Some(table);
+        }
+    }
+}
+/* Replace the current transformation with a LUT transformation using a given number of sample points */
+/// Samples a regular `samples`-per-axis RGB grid, passes the samples to
+/// `chain_transform`, and installs the returned table as the transform's CLUT together
+/// with the tetrahedral transform function matching `in_type`.
+///
+/// Returns `None` when `chain_transform` yields no table. For an `in_type` other than
+/// `RGBA8`, `BGRA8` or `RGB8` the transform function is left untouched (a debug
+/// assertion then checks that one is set).
+fn transform_precacheLUT_float(
+    mut transform: Box<qcms_transform>,
+    input: &Profile,
+    output: &Profile,
+    samples: i32,
+    in_type: DataType,
+) -> Option<Box<qcms_transform>> {
+    /* Number of floats in the sampled grid: three channels per grid point */
+    let lut_len = (3 * samples * samples * samples) as u32 as usize;
+
+    let mut src = Vec::with_capacity(lut_len);
+    let dest = vec![0.; lut_len];
+    /* Prepare a list of points we want to sample */
+    let last = (samples - 1) as f32;
+    for x in 0..samples {
+        let fx = x as f32 / last;
+        for y in 0..samples {
+            let fy = y as f32 / last;
+            for z in 0..samples {
+                src.extend_from_slice(&[fx, fy, z as f32 / last]);
+            }
+        }
+    }
+
+    let lut = chain_transform(input, output, src, dest, lut_len)?;
+    transform.clut = Some(lut);
+    transform.grid_size = samples as u16;
+    match in_type {
+        RGBA8 => transform.transform_fn = Some(qcms_transform_data_tetra_clut_rgba),
+        BGRA8 => transform.transform_fn = Some(qcms_transform_data_tetra_clut_bgra),
+        RGB8 => transform.transform_fn = Some(qcms_transform_data_tetra_clut_rgb),
+        _ => {}
+    }
+    debug_assert!(transform.transform_fn.is_some());
+
+    Some(transform)
+}
+
+/// Samples a regular `samples`-per-axis grid over four input channels, passes the
+/// samples to `chain_transform`, and installs the returned table as the transform's
+/// CLUT with `qcms_transform_data_tetra_clut_cmyk` as the transform function.
+///
+/// Samples are emitted as `[c, m, y, k]` quadruples with `k` varying slowest and `y`
+/// fastest. Returns `None` when `chain_transform` yields no table.
+///
+/// # Panics
+/// Panics if a table was produced but `in_type` is not `DataType::CMYK`.
+fn transform_precacheLUT_cmyk_float(
+    mut transform: Box<qcms_transform>,
+    input: &Profile,
+    output: &Profile,
+    samples: i32,
+    in_type: DataType,
+) -> Option<Box<qcms_transform>> {
+    /* Number of floats in the sampled grid: four channels per grid point */
+    let lut_len = (4 * samples * samples * samples * samples) as u32 as usize;
+
+    let mut src = Vec::with_capacity(lut_len);
+    let dest = vec![0.; lut_len];
+    /* Prepare a list of points we want to sample */
+    let last = (samples - 1) as f32;
+    for k in 0..samples {
+        let fk = k as f32 / last;
+        for c in 0..samples {
+            let fc = c as f32 / last;
+            for m in 0..samples {
+                let fm = m as f32 / last;
+                for y in 0..samples {
+                    src.extend_from_slice(&[fc, fm, y as f32 / last, fk]);
+                }
+            }
+        }
+    }
+
+    let lut = chain_transform(input, output, src, dest, lut_len)?;
+    transform.clut = Some(lut);
+    transform.grid_size = samples as u16;
+    assert!(in_type == DataType::CMYK);
+    transform.transform_fn = Some(qcms_transform_data_tetra_clut_cmyk);
+
+    Some(transform)
+}
+
+pub fn transform_create(
+    input: &Profile,
+    in_type: DataType,
+    output: &Profile,
+    out_type: DataType,
+    _intent: Intent,
+) -> Option<Box<qcms_transform>> {
+    // Ensure the requested input and output types make sense.
+    let matching_format = match (in_type, out_type) {
+        (RGB8, RGB8) => true,
+        (RGBA8, RGBA8) => true,
+        (BGRA8, BGRA8) => true,
+        (Gray8, out_type) => matches!(out_type, RGB8 | RGBA8 | BGRA8),
+        (GrayA8, out_type) => matches!(out_type, RGBA8 | BGRA8),
+        (CMYK, RGB8) => true,
+        _ => false,
+    };
+    if !matching_format {
+        debug_assert!(false, "input/output type");
+        return None;
+    }
+    let mut transform: Box<qcms_transform> = Box::new(Default::default());
+    let mut precache: bool = false;
+    if output.output_table_r.is_some()
+        && output.output_table_g.is_some()
+        && output.output_table_b.is_some()
+    {
+        precache = true
+    }
+    // This precache assumes RGB_SIGNATURE (fails on GRAY_SIGNATURE, for instance)
+    if SUPPORTS_ICCV4.load(Ordering::Relaxed)
+        && (in_type == RGB8 || in_type == RGBA8 || in_type == BGRA8 || in_type == CMYK)
+        && (input.A2B0.is_some()
+            || output.B2A0.is_some()
+            || input.mAB.is_some()
+            || output.mAB.is_some())
+    {
+        if in_type == CMYK {
+            return transform_precacheLUT_cmyk_float(transform, input, output, 17, in_type);
+        }
+        // Precache the transformation to a CLUT 33x33x33 in size.
+        // 33 is used by many profiles and works well in pratice.
+        // This evenly divides 256 into blocks of 8x8x8.
+        // TODO For transforming small data sets of about 200x200 or less
+        // precaching should be avoided.
+        let result = transform_precacheLUT_float(transform, input, output, 33, in_type);
+        debug_assert!(result.is_some(), "precacheLUT failed");
+        return result;
+    }
+    if precache {
+        transform.output_table_r = Some(Arc::clone(output.output_table_r.as_ref().unwrap()));
+        transform.output_table_g = Some(Arc::clone(output.output_table_g.as_ref().unwrap()));
+        transform.output_table_b = Some(Arc::clone(output.output_table_b.as_ref().unwrap()));
+    } else {
+        if output.redTRC.is_none() || output.greenTRC.is_none() || output.blueTRC.is_none() {
+            return None;
+        }
+        transform.output_gamma_lut_r = build_output_lut(output.redTRC.as_deref().unwrap());
+        transform.output_gamma_lut_g = build_output_lut(output.greenTRC.as_deref().unwrap());
+        transform.output_gamma_lut_b = build_output_lut(output.blueTRC.as_deref().unwrap());
+
+        if transform.output_gamma_lut_r.is_none()
+            || transform.output_gamma_lut_g.is_none()
+            || transform.output_gamma_lut_b.is_none()
+        {
+            return None;
+        }
+    }
+    if input.color_space == RGB_SIGNATURE {
+        if precache {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            if is_x86_feature_detected!("avx") {
+                if in_type == RGB8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgb_out_lut_avx)
+                } else if in_type == RGBA8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgba_out_lut_avx)
+                } else if in_type == BGRA8 {
+                    transform.transform_fn = Some(qcms_transform_data_bgra_out_lut_avx)
+                }
+            } else if cfg!(not(miri)) && is_x86_feature_detected!("sse2") {
+                if in_type == RGB8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgb_out_lut_sse2)
+                } else if in_type == RGBA8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgba_out_lut_sse2)
+                } else if in_type == BGRA8 {
+                    transform.transform_fn = Some(qcms_transform_data_bgra_out_lut_sse2)
+                }
+            }
+
+            #[cfg(all(target_arch = "arm", feature = "neon"))]
+            let neon_supported = is_arm_feature_detected!("neon");
+            #[cfg(all(target_arch = "aarch64", feature = "neon"))]
+            let neon_supported = is_aarch64_feature_detected!("neon");
+
+            #[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "neon"))]
+            if neon_supported {
+                if in_type == RGB8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgb_out_lut_neon)
+                } else if in_type == RGBA8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgba_out_lut_neon)
+                } else if in_type == BGRA8 {
+                    transform.transform_fn = Some(qcms_transform_data_bgra_out_lut_neon)
+                }
+            }
+
+            if transform.transform_fn.is_none() {
+                if in_type == RGB8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgb_out_lut_precache)
+                } else if in_type == RGBA8 {
+                    transform.transform_fn = Some(qcms_transform_data_rgba_out_lut_precache)
+                } else if in_type == BGRA8 {
+                    transform.transform_fn = Some(qcms_transform_data_bgra_out_lut_precache)
+                }
+            }
+        } else if in_type == RGB8 {
+            transform.transform_fn = Some(qcms_transform_data_rgb_out_lut)
+        } else if in_type == RGBA8 {
+            transform.transform_fn = Some(qcms_transform_data_rgba_out_lut)
+        } else if in_type == BGRA8 {
+            transform.transform_fn = Some(qcms_transform_data_bgra_out_lut)
+        }
+        //XXX: avoid duplicating tables if we can
+        transform.input_gamma_table_r = build_input_gamma_table(input.redTRC.as_deref());
+        transform.input_gamma_table_g = build_input_gamma_table(input.greenTRC.as_deref());
+        transform.input_gamma_table_b = build_input_gamma_table(input.blueTRC.as_deref());
+        if transform.input_gamma_table_r.is_none()
+            || transform.input_gamma_table_g.is_none()
+            || transform.input_gamma_table_b.is_none()
+        {
+            return None;
+        }
+        /* build combined colorant matrix */
+
+        let in_matrix: Matrix = build_colorant_matrix(input);
+        let mut out_matrix: Matrix = build_colorant_matrix(output);
+        out_matrix = out_matrix.invert()?;
+
+        let result_0: Matrix = Matrix::multiply(out_matrix, in_matrix);
+        /* check for NaN values in the matrix and bail if we find any */
+        let mut i: u32 = 0;
+        while i < 3 {
+            let mut j: u32 = 0;
+            while j < 3 {
+                #[allow(clippy::eq_op, clippy::float_cmp)]
+                if result_0.m[i as usize][j as usize].is_nan() {
+                    return None;
+                }
+                j += 1
+            }
+            i += 1
+        }
+        /* store the results in column major mode
+         * this makes doing the multiplication with sse easier */
+        transform.matrix[0][0] = result_0.m[0][0];
+        transform.matrix[1][0] = result_0.m[0][1];
+        transform.matrix[2][0] = result_0.m[0][2];
+        transform.matrix[0][1] = result_0.m[1][0];
+        transform.matrix[1][1] = result_0.m[1][1];
+        transform.matrix[2][1] = result_0.m[1][2];
+        transform.matrix[0][2] = result_0.m[2][0];
+        transform.matrix[1][2] = result_0.m[2][1];
+        transform.matrix[2][2] = result_0.m[2][2]
+    } else if input.color_space == GRAY_SIGNATURE {
+        transform.input_gamma_table_gray = build_input_gamma_table(input.grayTRC.as_deref());
+        transform.input_gamma_table_gray.as_ref()?;
+        if precache {
+            if out_type == RGB8 {
+                transform.transform_fn = Some(qcms_transform_data_gray_out_precache)
+            } else if out_type == RGBA8 {
+                if in_type == Gray8 {
+                    transform.transform_fn = Some(qcms_transform_data_gray_rgba_out_precache)
+                } else {
+                    transform.transform_fn = Some(qcms_transform_data_graya_rgba_out_precache)
+                }
+            } else if out_type == BGRA8 {
+                if in_type == Gray8 {
+                    transform.transform_fn = Some(qcms_transform_data_gray_bgra_out_precache)
+                } else {
+                    transform.transform_fn = Some(qcms_transform_data_graya_bgra_out_precache)
+                }
+            }
+        } else if out_type == RGB8 {
+            transform.transform_fn = Some(qcms_transform_data_gray_out_lut)
+        } else if out_type == RGBA8 {
+            if in_type == Gray8 {
+                transform.transform_fn = Some(qcms_transform_data_gray_rgba_out_lut)
+            } else {
+                transform.transform_fn = Some(qcms_transform_data_graya_rgba_out_lut)
+            }
+        } else if out_type == BGRA8 {
+            if in_type == Gray8 {
+                transform.transform_fn = Some(qcms_transform_data_gray_bgra_out_lut)
+            } else {
+                transform.transform_fn = Some(qcms_transform_data_graya_bgra_out_lut)
+            }
+        }
+    } else {
+        debug_assert!(false, "unexpected colorspace");
+        return None;
+    }
+    debug_assert!(transform.transform_fn.is_some());
+    Some(transform)
+}
+/// A transform from an input profile to an output one.
+pub struct Transform {
+    src_ty: DataType, // pixel format of the data the transform reads
+    dst_ty: DataType, // pixel format of the data the transform writes
+    xfm: Box<qcms_transform>, // transform state returned by `transform_create`
+}
+
+impl Transform {
+    /// Create a new transform from `input` to `output` for pixels of `DataType` `ty` with `intent`.
+    ///
+    /// Source and destination pixels share the format `ty`. Returns `None` when
+    /// `transform_create` cannot build a transform for these arguments.
+    pub fn new(input: &Profile, output: &Profile, ty: DataType, intent: Intent) -> Option<Self> {
+        Self::new_to(input, output, ty, ty, intent)
+    }
+
+    /// Create a new transform from `input` to `output` that reads `src_ty` pixels and writes
+    /// `dst_ty` pixels, using `intent`.
+    ///
+    /// Returns `None` when `transform_create` cannot build a transform for these arguments.
+    pub fn new_to(
+        input: &Profile,
+        output: &Profile,
+        src_ty: DataType,
+        dst_ty: DataType,
+        intent: Intent,
+    ) -> Option<Self> {
+        let xfm = transform_create(input, src_ty, output, dst_ty, intent)?;
+        Some(Transform { src_ty, dst_ty, xfm })
+    }
+
+    /// Apply the color space transform to `data` in place.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `data` ends in a partial pixel, or if the destination pixel type is wider
+    /// than the source one: an in-place transform would then write past the end of `data`.
+    /// Use [`Transform::convert`] with a separate output buffer for such transforms.
+    pub fn apply(&self, data: &mut [u8]) {
+        let (src_bpp, dst_bpp) = (self.src_ty.bytes_per_pixel(), self.dst_ty.bytes_per_pixel());
+        if data.len() % src_bpp != 0 {
+            panic!("incomplete pixels: should be a multiple of {} got {}", src_bpp, data.len())
+        }
+        // Every pixel read yields one pixel written, so wider output cannot fit in `data`.
+        assert!(dst_bpp <= src_bpp, "in-place transform would overflow `data`; use `convert`");
+        // Take one mutable pointer and use it for both roles; calling `as_ptr()` and then
+        // `as_mut_ptr()` would create two reborrows where the second invalidates the first.
+        let pixels = data.as_mut_ptr();
+        // SAFETY: `pixels` is valid for the `data.len() / src_bpp` whole pixels passed below,
+        // and the check above keeps the written pixels within `data`.
+        unsafe {
+            let transform_fn = self.xfm.transform_fn.expect("non-null function pointer");
+            transform_fn(&*self.xfm, pixels, pixels, data.len() / src_bpp);
+        }
+    }
+
+    /// Convert the pixels in `src` and write the result to `dst`.
+    ///
+    /// `src` is read as `src_ty` pixels and `dst` is written as `dst_ty` pixels.
+    ///
+    /// # Panics
+    ///
+    /// Panics if either slice ends in a partial pixel of its own format, or if the two
+    /// slices do not hold the same number of pixels.
+    pub fn convert(&self, src: &[u8], dst: &mut [u8]) {
+        let (src_bpp, dst_bpp) = (self.src_ty.bytes_per_pixel(), self.dst_ty.bytes_per_pixel());
+        if src.len() % src_bpp != 0 {
+            panic!("incomplete pixels: should be a multiple of {} got {}", src_bpp, src.len())
+        }
+        if dst.len() % dst_bpp != 0 {
+            panic!("incomplete pixels: should be a multiple of {} got {}", dst_bpp, dst.len())
+        }
+        // The transform is told only the pixel count of `src`, so `dst` must hold as many.
+        assert_eq!(src.len() / src_bpp, dst.len() / dst_bpp);
+        // SAFETY: both slices were just checked to hold exactly the pixel count passed below.
+        unsafe {
+            let transform_fn = self.xfm.transform_fn.expect("non-null function pointer");
+            transform_fn(&*self.xfm, src.as_ptr(), dst.as_mut_ptr(), src.len() / src_bpp);
+        }
+    }
+}
+
+#[no_mangle] // keep the symbol name unmangled
+pub extern "C" fn qcms_enable_iccv4() {
+    SUPPORTS_ICCV4.store(true, Ordering::Relaxed); // set the `SUPPORTS_ICCV4` atomic flag to true
+}
diff --git a/gfx/qcms/src/transform_avx.rs b/gfx/qcms/src/transform_avx.rs
new file mode 100644
index 0000000000..b34fc869d5
--- /dev/null
+++ b/gfx/qcms/src/transform_avx.rs
@@ -0,0 +1,230 @@
+use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA};
+#[cfg(target_arch = "x86")]
+pub use std::arch::x86::{
+    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
+    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
+    _mm256_mul_ps, _mm256_set1_ps, _mm256_set_ps, _mm256_setzero_ps, _mm256_store_si256,
+    _mm_add_ps, _mm_broadcast_ss, _mm_cvtps_epi32, _mm_loadu_ps, _mm_max_ps, _mm_min_ps,
+    _mm_mul_ps, _mm_setzero_ps, _mm_store_si128,
+};
+#[cfg(target_arch = "x86_64")]
+pub use std::arch::x86_64::{
+    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
+    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
+    _mm256_mul_ps, _mm256_set1_ps, _mm256_set_ps, _mm256_setzero_ps, _mm256_store_si256,
+    _mm_add_ps, _mm_broadcast_ss, _mm_cvtps_epi32, _mm_loadu_ps, _mm_max_ps, _mm_min_ps,
+    _mm_mul_ps, _mm_setzero_ps, _mm_store_si128,
+};
+
+#[repr(align(32))] // 32-byte alignment, needed by the aligned `_mm256_store_si256` into this buffer
+struct Output([u32; 8]); // scratch holding the eight 32-bit lanes of one 256-bit result
+
+#[target_feature(enable = "avx")] // compiled with AVX enabled; callers must ensure the CPU supports it
+unsafe extern "C" fn qcms_transform_data_template_lut_avx<F: Format>( // F: byte offsets of R, G, B, A in a pixel
+    transform: &qcms_transform, // source of the matrix, input gamma tables and output tables
+    mut src: *const u8, // input pixels; advanced as pixels are consumed
+    mut dest: *mut u8, // output pixels; advanced as pixels are produced
+    mut length: usize, // number of pixels (not bytes) still to convert
+) {
+    let mat: *const [f32; 4] = (*transform).matrix.as_ptr(); // matrix rows, four floats each
+    let mut input: Output = std::mem::zeroed(); // 32-byte aligned scratch for the SIMD stores
+    /* `output` aliases the scratch buffer `input`: table indices are stored there
+     * by the SIMD stores below and then read back one 32-bit lane at a time */
+    let output: *const u32 = &mut input as *mut Output as *mut u32;
+    /* input gamma tables, indexed below by the raw 8-bit channel value; panics if one is missing */
+    let igtbl_r: *const f32 = (*transform).input_gamma_table_r.as_ref().unwrap().as_ptr();
+    let igtbl_g: *const f32 = (*transform).input_gamma_table_g.as_ref().unwrap().as_ptr();
+    let igtbl_b: *const f32 = (*transform).input_gamma_table_b.as_ref().unwrap().as_ptr();
+    /* output tables, indexed below by the clamped and scaled result; panics if one is missing */
+    let otdata_r: *const u8 = (*transform)
+        .output_table_r
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    let otdata_g: *const u8 = (*transform)
+        .output_table_g
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    let otdata_b: *const u8 = (*transform)
+        .output_table_b
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    /* input matrix values never change; each row is copied into both 128-bit halves */
+    let mat0: __m256 = _mm256_broadcast_ps(&*((*mat.offset(0isize)).as_ptr() as *const __m128));
+    let mat1: __m256 = _mm256_broadcast_ps(&*((*mat.offset(1isize)).as_ptr() as *const __m128));
+    let mat2: __m256 = _mm256_broadcast_ps(&*((*mat.offset(2isize)).as_ptr() as *const __m128));
+    /* these values don't change, either */
+    let max: __m256 = _mm256_set1_ps(CLAMPMAXVAL); // upper clamp bound
+    let min: __m256 = _mm256_setzero_ps(); // lower clamp bound
+    let scale: __m256 = _mm256_set1_ps(FLOATSCALE); // turns the clamped value into a table index
+    let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32; // bytes per pixel; 0xff = no alpha
+    /* working variables */
+    let mut vec_r: __m256 = _mm256_setzero_ps();
+    let mut vec_g: __m256 = _mm256_setzero_ps();
+    let mut vec_b: __m256 = _mm256_setzero_ps();
+    let mut result: __m256;
+    let mut vec_r0: __m128; // first pixel of the pair
+    let mut vec_g0: __m128;
+    let mut vec_b0: __m128;
+    let mut vec_r1: __m128; // second pixel of the pair
+    let mut vec_g1: __m128;
+    let mut vec_b1: __m128;
+    let mut alpha1: u8 = 0; // alpha bytes are copied from src to dest unchanged
+    let mut alpha2: u8 = 0;
+    /* nothing to do for an empty buffer */
+    if length == 0 {
+        return;
+    }
+    /* If there are at least 2 pixels, then we can load their components into
+    a single 256-bit register for processing. */
+    if length > 1 {
+        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
+        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
+        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
+        vec_r1 =
+            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
+        vec_g1 =
+            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
+        vec_b1 =
+            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
+        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1); // low half: pixel 1, high half: pixel 2
+        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
+        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
+        if F::kAIndex != 0xff {
+            alpha1 = *src.add(F::kAIndex);
+            alpha2 = *src.add(F::kAIndex + components as usize)
+        }
+    }
+    /* If there are at least 4 pixels, then we can iterate and preload the
+    next 2 while we store the result of the current 2. */
+    while length > 3 {
+        /* Ensure we are pointing at the next 2 pixels for the next load. */
+        src = src.offset((2 * components) as isize);
+        /* gamma * matrix */
+        vec_r = _mm256_mul_ps(vec_r, mat0);
+        vec_g = _mm256_mul_ps(vec_g, mat1);
+        vec_b = _mm256_mul_ps(vec_b, mat2);
+        /* store alpha for these pixels; load alpha for next two */
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = alpha1;
+            *dest.add(F::kAIndex + components as usize) = alpha2;
+            alpha1 = *src.add(F::kAIndex);
+            alpha2 = *src.add(F::kAIndex + components as usize)
+        }
+        /* sum the three products, clamp to [min, max], then scale to table indices */
+        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
+        vec_r = _mm256_max_ps(min, vec_r);
+        vec_r = _mm256_min_ps(max, vec_r);
+        result = _mm256_mul_ps(vec_r, scale);
+        /* store calc'd output tables indices */
+        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
+        /* load gamma values for next loop while store completes */
+        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
+        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
+        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
+        vec_r1 =
+            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
+        vec_g1 =
+            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
+        vec_b1 =
+            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
+        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
+        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
+        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
+        /* use calc'd indices to output RGB values: lanes 0-2 are pixel 1, lanes 4-6 are pixel 2 */
+        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
+        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
+        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
+        *dest.add(F::kRIndex + components as usize) =
+            *otdata_r.offset(*output.offset(4isize) as isize);
+        *dest.add(F::kGIndex + components as usize) =
+            *otdata_g.offset(*output.offset(5isize) as isize);
+        *dest.add(F::kBIndex + components as usize) =
+            *otdata_b.offset(*output.offset(6isize) as isize);
+        dest = dest.offset((2 * components) as isize);
+        length -= 2
+    }
+    /* There are 0-3 pixels remaining. If there are 2-3 remaining, then we know
+    we have already populated the necessary registers to start the transform. */
+    if length > 1 {
+        vec_r = _mm256_mul_ps(vec_r, mat0);
+        vec_g = _mm256_mul_ps(vec_g, mat1);
+        vec_b = _mm256_mul_ps(vec_b, mat2);
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = alpha1;
+            *dest.add(F::kAIndex + components as usize) = alpha2
+        }
+        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
+        vec_r = _mm256_max_ps(min, vec_r);
+        vec_r = _mm256_min_ps(max, vec_r);
+        result = _mm256_mul_ps(vec_r, scale);
+        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
+        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
+        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
+        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
+        *dest.add(F::kRIndex + components as usize) =
+            *otdata_r.offset(*output.offset(4isize) as isize);
+        *dest.add(F::kGIndex + components as usize) =
+            *otdata_g.offset(*output.offset(5isize) as isize);
+        *dest.add(F::kBIndex + components as usize) =
+            *otdata_b.offset(*output.offset(6isize) as isize);
+        src = src.offset((2 * components) as isize);
+        dest = dest.offset((2 * components) as isize);
+        length -= 2
+    }
+    /* There may be 0-1 pixels remaining; a single pixel uses the 128-bit halves only. */
+    if length == 1 {
+        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
+        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
+        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
+        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(mat0));
+        vec_g0 = _mm_mul_ps(vec_g0, _mm256_castps256_ps128(mat1));
+        vec_b0 = _mm_mul_ps(vec_b0, _mm256_castps256_ps128(mat2));
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = *src.add(F::kAIndex)
+        }
+        vec_r0 = _mm_add_ps(vec_r0, _mm_add_ps(vec_g0, vec_b0));
+        vec_r0 = _mm_max_ps(_mm256_castps256_ps128(min), vec_r0);
+        vec_r0 = _mm_min_ps(_mm256_castps256_ps128(max), vec_r0);
+        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(scale));
+        _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(vec_r0));
+        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
+        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
+        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize)
+    };
+}
+#[no_mangle] // keep the symbol name unmangled
+#[target_feature(enable = "avx")] // compiled with AVX enabled; only call on CPUs that support AVX
+pub unsafe fn qcms_transform_data_rgb_out_lut_avx(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `RGB` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_avx::<RGB>(transform, src, dest, length); // `RGB` instantiation of the AVX template
+}
+#[no_mangle] // keep the symbol name unmangled
+#[target_feature(enable = "avx")] // compiled with AVX enabled; only call on CPUs that support AVX
+pub unsafe fn qcms_transform_data_rgba_out_lut_avx(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `RGBA` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_avx::<RGBA>(transform, src, dest, length); // `RGBA` instantiation of the AVX template
+}
+#[no_mangle] // keep the symbol name unmangled
+#[target_feature(enable = "avx")] // compiled with AVX enabled; only call on CPUs that support AVX
+pub unsafe fn qcms_transform_data_bgra_out_lut_avx(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `BGRA` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_avx::<BGRA>(transform, src, dest, length); // `BGRA` instantiation of the AVX template
+}
diff --git a/gfx/qcms/src/transform_neon.rs b/gfx/qcms/src/transform_neon.rs
new file mode 100644
index 0000000000..d3983ba18c
--- /dev/null
+++ b/gfx/qcms/src/transform_neon.rs
@@ -0,0 +1,158 @@
+use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA};
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64::{
+    float32x4_t, int32x4_t, vaddq_f32, vcvtq_s32_f32, vgetq_lane_s32, vld1q_dup_f32, vld1q_f32,
+    vmaxq_f32, vminq_f32, vmulq_f32,
+};
+#[cfg(target_arch = "arm")]
+use core::arch::arm::{
+    float32x4_t, int32x4_t, vaddq_f32, vcvtq_s32_f32, vgetq_lane_s32, vld1q_dup_f32, vld1q_f32,
+    vmaxq_f32, vminq_f32, vmulq_f32,
+};
+use std::mem::zeroed;
+
+static floatScale: f32 = FLOATSCALE; // read-only: only loaded by address via `vld1q_dup_f32`, so no `mut`
+static clampMaxValue: f32 = CLAMPMAXVAL; // read-only upper clamp bound, likewise only loaded by address
+
+#[target_feature(enable = "neon")] // compiled with NEON enabled; callers must ensure the CPU supports it
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+unsafe fn qcms_transform_data_template_lut_neon<F: Format>( // F: byte offsets of R, G, B, A in a pixel
+    transform: &qcms_transform, // source of the matrix, input gamma tables and output tables
+    mut src: *const u8, // input pixels; advanced one pixel at a time
+    mut dest: *mut u8, // output pixels; advanced one pixel at a time
+    mut length: usize, // number of pixels (not bytes) to convert
+) {
+    let mat: *const [f32; 4] = (*transform).matrix.as_ptr(); // matrix rows, four floats each
+    /* input gamma tables, indexed below by the raw 8-bit channel value; panics if one is missing */
+    let igtbl_r: *const f32 = (*transform).input_gamma_table_r.as_ref().unwrap().as_ptr();
+    let igtbl_g: *const f32 = (*transform).input_gamma_table_g.as_ref().unwrap().as_ptr();
+    let igtbl_b: *const f32 = (*transform).input_gamma_table_b.as_ref().unwrap().as_ptr();
+    /* output tables, indexed below by the clamped and scaled result; panics if one is missing */
+    let otdata_r: *const u8 = (*transform)
+        .output_table_r
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    let otdata_g: *const u8 = (*transform)
+        .output_table_g
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    let otdata_b: *const u8 = (*transform)
+        .output_table_b
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    /* input matrix values never change */
+    let mat0: float32x4_t = vld1q_f32((*mat.offset(0isize)).as_ptr());
+    let mat1: float32x4_t = vld1q_f32((*mat.offset(1isize)).as_ptr());
+    let mat2: float32x4_t = vld1q_f32((*mat.offset(2isize)).as_ptr());
+    /* these values don't change, either */
+    let max: float32x4_t = vld1q_dup_f32(&clampMaxValue); // upper clamp bound in every lane
+    let min: float32x4_t = zeroed(); // lower clamp bound
+    let scale: float32x4_t = vld1q_dup_f32(&floatScale); // turns the clamped value into a table index
+    let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32; // bytes per pixel; 0xff = no alpha
+    /* working variables */
+    let mut vec_r: float32x4_t;
+    let mut vec_g: float32x4_t;
+    let mut vec_b: float32x4_t;
+    let mut result: int32x4_t;
+    let mut alpha: u8 = 0; // alpha byte is copied from src to dest unchanged
+    /* nothing to do for an empty buffer */
+    if length == 0 {
+        return;
+    }
+    /* one pixel is handled outside of the loop */
+    length = length.wrapping_sub(1);
+    /* setup for transforming 1st pixel */
+    vec_r = vld1q_dup_f32(&*igtbl_r.offset(*src.offset(F::kRIndex as isize) as isize));
+    vec_g = vld1q_dup_f32(&*igtbl_g.offset(*src.offset(F::kGIndex as isize) as isize));
+    vec_b = vld1q_dup_f32(&*igtbl_b.offset(*src.offset(F::kBIndex as isize) as isize));
+    if F::kAIndex != 0xff {
+        alpha = *src.offset(F::kAIndex as isize)
+    }
+    src = src.offset(components as isize);
+    let mut i: usize = 0; // same width as `length`, so the counter cannot wrap before the loop ends
+    while i < length {
+        /* gamma * matrix */
+        vec_r = vmulq_f32(vec_r, mat0);
+        vec_g = vmulq_f32(vec_g, mat1);
+        vec_b = vmulq_f32(vec_b, mat2);
+        /* store alpha for this pixel; load alpha for next */
+        if F::kAIndex != 0xff {
+            *dest.offset(F::kAIndex as isize) = alpha;
+            alpha = *src.offset(F::kAIndex as isize)
+        }
+        /* sum the three products, clamp to [min, max], then scale to table indices */
+        vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
+        vec_r = vmaxq_f32(min, vec_r);
+        vec_r = vminq_f32(max, vec_r);
+        result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));
+
+        /* use calc'd indices to output RGB values (lanes 0, 1, 2 hold R, G, B) */
+        *dest.offset(F::kRIndex as isize) = *otdata_r.offset(vgetq_lane_s32(result, 0) as isize);
+        *dest.offset(F::kGIndex as isize) = *otdata_g.offset(vgetq_lane_s32(result, 1) as isize);
+        *dest.offset(F::kBIndex as isize) = *otdata_b.offset(vgetq_lane_s32(result, 2) as isize);
+
+        /* load gamma values for next loop while store completes */
+        vec_r = vld1q_dup_f32(&*igtbl_r.offset(*src.offset(F::kRIndex as isize) as isize));
+        vec_g = vld1q_dup_f32(&*igtbl_g.offset(*src.offset(F::kGIndex as isize) as isize));
+        vec_b = vld1q_dup_f32(&*igtbl_b.offset(*src.offset(F::kBIndex as isize) as isize));
+
+        dest = dest.offset(components as isize);
+        src = src.offset(components as isize);
+        i += 1
+    }
+    /* handle final (maybe only) pixel */
+    vec_r = vmulq_f32(vec_r, mat0);
+    vec_g = vmulq_f32(vec_g, mat1);
+    vec_b = vmulq_f32(vec_b, mat2);
+    if F::kAIndex != 0xff {
+        *dest.offset(F::kAIndex as isize) = alpha
+    }
+    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
+    vec_r = vmaxq_f32(min, vec_r);
+    vec_r = vminq_f32(max, vec_r);
+    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));
+
+    *dest.offset(F::kRIndex as isize) = *otdata_r.offset(vgetq_lane_s32(result, 0) as isize);
+    *dest.offset(F::kGIndex as isize) = *otdata_g.offset(vgetq_lane_s32(result, 1) as isize);
+    *dest.offset(F::kBIndex as isize) = *otdata_b.offset(vgetq_lane_s32(result, 2) as isize);
+}
+#[no_mangle] // keep the symbol name unmangled
+#[target_feature(enable = "neon")] // compiled with NEON enabled; only call on CPUs that support NEON
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+pub unsafe fn qcms_transform_data_rgb_out_lut_neon(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `RGB` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_neon::<RGB>(transform, src, dest, length); // `RGB` instantiation of the NEON template
+}
+#[no_mangle] // keep the symbol name unmangled
+#[target_feature(enable = "neon")] // compiled with NEON enabled; only call on CPUs that support NEON
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+pub unsafe fn qcms_transform_data_rgba_out_lut_neon(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `RGBA` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_neon::<RGBA>(transform, src, dest, length); // `RGBA` instantiation of the NEON template
+}
+
+#[no_mangle] // keep the symbol name unmangled
+#[target_feature(enable = "neon")] // compiled with NEON enabled; only call on CPUs that support NEON
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+pub unsafe fn qcms_transform_data_bgra_out_lut_neon(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `BGRA` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_neon::<BGRA>(transform, src, dest, length); // `BGRA` instantiation of the NEON template
+}
diff --git a/gfx/qcms/src/transform_sse2.rs b/gfx/qcms/src/transform_sse2.rs
new file mode 100644
index 0000000000..f6bccaadc3
--- /dev/null
+++ b/gfx/qcms/src/transform_sse2.rs
@@ -0,0 +1,159 @@
+use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA};
+#[cfg(target_arch = "x86")]
+pub use std::arch::x86::{
+    __m128, __m128i, _mm_add_ps, _mm_cvtps_epi32, _mm_load_ps, _mm_load_ss, _mm_max_ps, _mm_min_ps,
+    _mm_mul_ps, _mm_set1_ps, _mm_setzero_ps, _mm_shuffle_ps, _mm_store_si128,
+};
+#[cfg(target_arch = "x86_64")]
+pub use std::arch::x86_64::{
+    __m128, __m128i, _mm_add_ps, _mm_cvtps_epi32, _mm_load_ps, _mm_load_ss, _mm_max_ps, _mm_min_ps,
+    _mm_mul_ps, _mm_set1_ps, _mm_setzero_ps, _mm_shuffle_ps, _mm_store_si128,
+};
+
+#[repr(align(16))] // 16-byte alignment, needed by the aligned `_mm_store_si128` into this buffer
+struct Output([u32; 4]); // scratch holding the four 32-bit lanes of one 128-bit result
+
+unsafe extern "C" fn qcms_transform_data_template_lut_sse2<F: Format>( // F: byte offsets of R, G, B, A in a pixel
+    transform: &qcms_transform, // source of the matrix, input gamma tables and output tables
+    mut src: *const u8, // input pixels; advanced one pixel at a time
+    mut dest: *mut u8, // output pixels; advanced one pixel at a time
+    mut length: usize, // number of pixels (not bytes) to convert
+) {
+    let mat: *const [f32; 4] = (*transform).matrix.as_ptr(); // matrix rows, four floats each
+    let mut input: Output = std::mem::zeroed(); // 16-byte aligned scratch for the SIMD stores
+    /* `output` aliases the scratch buffer `input`: table indices are stored there
+     * by the SIMD stores below and then read back one 32-bit lane at a time */
+    let output: *const u32 = &mut input as *mut Output as *mut u32;
+    /* input gamma tables, indexed below by the raw 8-bit channel value; panics if one is missing */
+    let igtbl_r: *const f32 = (*transform).input_gamma_table_r.as_ref().unwrap().as_ptr();
+    let igtbl_g: *const f32 = (*transform).input_gamma_table_g.as_ref().unwrap().as_ptr();
+    let igtbl_b: *const f32 = (*transform).input_gamma_table_b.as_ref().unwrap().as_ptr();
+    /* output tables, indexed below by the clamped and scaled result; panics if one is missing */
+    let otdata_r: *const u8 = (*transform)
+        .output_table_r
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    let otdata_g: *const u8 = (*transform)
+        .output_table_g
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    let otdata_b: *const u8 = (*transform)
+        .output_table_b
+        .as_deref()
+        .unwrap()
+        .data
+        .as_ptr();
+    /* input matrix values never change; aligned loads, so this assumes 16-byte aligned rows (TODO confirm) */
+    let mat0: __m128 = _mm_load_ps((*mat.offset(0isize)).as_ptr());
+    let mat1: __m128 = _mm_load_ps((*mat.offset(1isize)).as_ptr());
+    let mat2: __m128 = _mm_load_ps((*mat.offset(2isize)).as_ptr());
+    /* these values don't change, either */
+    let max: __m128 = _mm_set1_ps(CLAMPMAXVAL); // upper clamp bound
+    let min: __m128 = _mm_setzero_ps(); // lower clamp bound
+    let scale: __m128 = _mm_set1_ps(FLOATSCALE); // turns the clamped value into a table index
+    let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32; // bytes per pixel; 0xff = no alpha
+    /* working variables */
+    let mut vec_r: __m128;
+    let mut vec_g: __m128;
+    let mut vec_b: __m128;
+    let mut result: __m128;
+    let mut alpha: u8 = 0; // alpha byte is copied from src to dest unchanged
+    /* nothing to do for an empty buffer */
+    if length == 0 {
+        return;
+    }
+    /* one pixel is handled outside of the loop */
+    length -= 1;
+    /* setup for transforming 1st pixel */
+    vec_r = _mm_load_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
+    vec_g = _mm_load_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
+    vec_b = _mm_load_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
+    if F::kAIndex != 0xff {
+        alpha = *src.add(F::kAIndex)
+    }
+    src = src.offset(components as isize);
+    let mut i: usize = 0; // same width as `length`, so the counter cannot wrap before the loop ends
+    while i < length {
+        /* position values from gamma tables (copy lane 0 into all four lanes) */
+        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+        /* gamma * matrix */
+        vec_r = _mm_mul_ps(vec_r, mat0);
+        vec_g = _mm_mul_ps(vec_g, mat1);
+        vec_b = _mm_mul_ps(vec_b, mat2);
+        /* store alpha for this pixel; load alpha for next */
+        if F::kAIndex != 0xff {
+            *dest.add(F::kAIndex) = alpha;
+            alpha = *src.add(F::kAIndex)
+        }
+        /* sum the three products, clamp to [min, max], then scale to table indices */
+        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+        vec_r = _mm_max_ps(min, vec_r);
+        vec_r = _mm_min_ps(max, vec_r);
+        result = _mm_mul_ps(vec_r, scale);
+        /* store calc'd output tables indices */
+        _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(result));
+        /* load gamma values for next loop while store completes */
+        vec_r = _mm_load_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
+        vec_g = _mm_load_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
+        vec_b = _mm_load_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
+        src = src.offset(components as isize);
+        /* use calc'd indices to output RGB values (lanes 0, 1, 2 hold R, G, B) */
+        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
+        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
+        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
+        dest = dest.offset(components as isize);
+        i += 1
+    }
+    /* handle final (maybe only) pixel */
+    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+    vec_r = _mm_mul_ps(vec_r, mat0);
+    vec_g = _mm_mul_ps(vec_g, mat1);
+    vec_b = _mm_mul_ps(vec_b, mat2);
+    if F::kAIndex != 0xff {
+        *dest.add(F::kAIndex) = alpha
+    }
+    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+    vec_r = _mm_max_ps(min, vec_r);
+    vec_r = _mm_min_ps(max, vec_r);
+    result = _mm_mul_ps(vec_r, scale);
+    _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(result));
+    *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
+    *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
+    *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
+}
+#[no_mangle] // keep the symbol name unmangled
+pub unsafe fn qcms_transform_data_rgb_out_lut_sse2(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `RGB` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_sse2::<RGB>(transform, src, dest, length); // `RGB` instantiation of the SSE2 template
+}
+#[no_mangle] // keep the symbol name unmangled
+pub unsafe fn qcms_transform_data_rgba_out_lut_sse2(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `RGBA` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_sse2::<RGBA>(transform, src, dest, length); // `RGBA` instantiation of the SSE2 template
+}
+
+#[no_mangle] // keep the symbol name unmangled
+pub unsafe fn qcms_transform_data_bgra_out_lut_sse2(
+    transform: &qcms_transform, // supplies the matrix, input gamma tables and output tables
+    src: *const u8, // source pixels, laid out as described by the `BGRA` format
+    dest: *mut u8, // destination pixels, written with the same layout
+    length: usize, // number of pixels to convert
+) {
+    qcms_transform_data_template_lut_sse2::<BGRA>(transform, src, dest, length); // `BGRA` instantiation of the SSE2 template
+}
diff --git a/gfx/qcms/src/transform_util.rs b/gfx/qcms/src/transform_util.rs
new file mode 100644
index 0000000000..75fd2ca0e2
--- /dev/null
+++ b/gfx/qcms/src/transform_util.rs
@@ -0,0 +1,608 @@
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+use std::convert::TryInto;
+
+use crate::{
+    iccread::{curveType, Profile},
+    s15Fixed16Number_to_float,
+};
+use crate::{matrix::Matrix, transform::PRECACHE_OUTPUT_MAX, transform::PRECACHE_OUTPUT_SIZE};
+
+//XXX: could use a better name
+pub type uint16_fract_t = u16;
+
+#[inline]
+fn u8Fixed8Number_to_float(x: u16) -> f32 {
+    // u8Fixed8Number: the value is the raw integer divided by 256, so
+    // 0x0000 -> 0.0, 0x0100 -> 1.0 and 0xffff -> 255 + 255/256.
+    let raw = f64::from(x);
+    (raw / 256.0) as f32
+}
+#[inline]
+pub fn clamp_float(a: f32) -> f32 {
+    /* Clamps `a` to the range 0..1 and maps NaN to 0.
+
+    The natural way to write this,
+      if (a > 1.) return 1.;
+      else if (a < 0) return 0;
+      else return a;
+
+    lets NaNs pass through, because every comparison with a NaN is false.
+    That is undesirable for most consumers, so the tests are ordered such
+    that a NaN falls into the final arm.
+    */
+    match a {
+        // above the range
+        v if v > 1. => 1.,
+        // inside the range; never taken for a NaN
+        v if v >= 0. => v,
+        // a < 0 or a is NaN
+        _ => 0.,
+    }
+}
+/// Linearly interpolates `table` at `input_value`; the output range is 0..1.
+/// `input_value` must be a value between 0 and 1.
+//XXX: is the above a good restriction to have?
+pub fn lut_interp_linear(input_value: f64, table: &[u16]) -> f32 {
+    let pos = input_value * (table.len() - 1) as f64;
+    let (hi, lo) = (pos.ceil() as i32, pos.floor() as i32);
+    // distance from `pos` up to the next table node (0 when `pos` is on a node)
+    let gap = hi as f64 - pos;
+    let hi_sample = table[hi as usize] as f64;
+    let lo_sample = table[lo as usize] as f64;
+    let blended = (hi_sample * (1. - gap) + lo_sample * gap) as f32;
+    // table entries are u16, so divide by 65535 to land in 0..1
+    blended * (1.0 / 65535.0)
+}
+/// Linearly interpolates `table` at `input_value`; input and output are u16 fractions of 0..1.
+#[no_mangle]
+pub fn lut_interp_linear16(input_value: u16, table: &[u16]) -> u16 {
+    // Scale the input to the index range times 65535. Widen to u64 first: the
+    // product exceeds i32::MAX once the table has more than 32769 entries.
+    let scaled: u64 = u64::from(input_value) * (table.len() as u64 - 1);
+    // ceil(scaled / 65535) and floor(scaled / 65535): the nodes around the input
+    let upper = ((scaled + 65534) / 65535) as usize;
+    let lower = (scaled / 65535) as usize;
+    // weight of the upper node: distance from `lower` to the input, in 0..65535
+    let interp = scaled % 65535;
+    let blended =
+        u64::from(table[upper]) * interp + u64::from(table[lower]) * (65535 - interp);
+    (blended / 65535) as u16
+}
+/// Interpolates `table` for an `input_value` in 0..PRECACHE_OUTPUT_MAX and
+/// returns the result as a u8 (0..255 standing for the range 0..1).
+fn lut_interp_linear_precache_output(input_value: u32, table: &[u16]) -> u8 {
+    let max = PRECACHE_OUTPUT_MAX as u32;
+    // Scale the input to the index range times PRECACHE_OUTPUT_MAX; the factor
+    // is divided back out below.
+    let scaled: u32 = input_value * (table.len() - 1) as u32;
+    // ceil(scaled / max) and floor(scaled / max): the nodes around the input
+    let (upper, lower) = (((scaled + max - 1) / max) as usize, (scaled / max) as usize);
+    // weight of the upper node, in 0..PRECACHE_OUTPUT_MAX
+    let interp: u32 = scaled % max;
+    // table values range over 0..65535, so this is 0..(65535*PRECACHE_OUTPUT_MAX)
+    let blended: u32 = table[upper] as u32 * interp + table[lower] as u32 * (max - interp);
+    // `step` maps the blend onto 0..255; adding half of it first rounds
+    let half_step = (PRECACHE_OUTPUT_MAX * 65535 / 255 / 2) as u32;
+    let step = (PRECACHE_OUTPUT_MAX * 65535 / 255) as u32;
+    let rounded = (blended + half_step) / step;
+    // the cast keeps the low 8 bits
+    rounded as u8
+}
+/// Linearly interpolates the float `table` at `value`, which must be between 0 and 1.
+//XXX: is the above a good restriction to have?
+pub fn lut_interp_linear_float(value: f32, table: &[f32]) -> f32 {
+    let pos = value * (table.len() - 1) as f32;
+    let (hi, lo) = (pos.ceil() as i32, pos.floor() as i32);
+    // distance from `pos` up to the next table node
+    let gap: f32 = hi as f32 - pos;
+    //XXX: can we be more performant here?
+    // the upper term is weighted in f64, the lower one in f32 before widening
+    let hi_term: f64 = table[hi as usize] as f64 * (1.0f64 - gap as f64);
+    let lo_term: f64 = (table[lo as usize] * gap) as f64;
+    (hi_term + lo_term) as f32
+}
+/// Builds a 256-entry table of `(i / 255)^gamma`, with `gamma` given as a u8Fixed8Number.
+fn compute_curve_gamma_table_type1(gamma: u16) -> Box<[f32; 256]> {
+    let exponent = u8Fixed8Number_to_float(gamma) as f64;
+    // a base in 0..1 raised to an exponent in 0..255 + 255/256 stays in 0..1
+    let gamma_table: Vec<f32> = (0..256)
+        .map(|i| (i as f64 / 255.0f64).powf(exponent) as f32)
+        .collect();
+    gamma_table.into_boxed_slice().try_into().unwrap()
+}
+/// Samples `table` at 256 evenly spaced inputs in 0..1 via `lut_interp_linear`.
+fn compute_curve_gamma_table_type2(table: &[u16]) -> Box<[f32; 256]> {
+    let gamma_table: Vec<f32> = (0..256)
+        .map(|i| lut_interp_linear(i as f64 / 255.0f64, table))
+        .collect();
+    gamma_table.into_boxed_slice().try_into().unwrap()
+}
+/// Evaluates the parametric curve `params` at 256 evenly spaced inputs, clamped to 0..1.
+fn compute_curve_gamma_table_type_parametric(params: &[f32]) -> Box<[f32; 256]> {
+    let curve = Param::new(params);
+    // clamp_float also turns a NaN result into 0
+    let gamma_table: Vec<f32> = (0..256)
+        .map(|i| clamp_float(curve.eval(i as f32 / 255.)))
+        .collect();
+    gamma_table.into_boxed_slice().try_into().unwrap()
+}
+
+/// Builds the identity table: entry `i` is `i / 255`.
+fn compute_curve_gamma_table_type0() -> Box<[f32; 256]> {
+    let gamma_table: Vec<f32> = (0..256)
+        .map(|i| (i as f64 / 255.0f64) as f32)
+        .collect();
+    gamma_table.into_boxed_slice().try_into().unwrap()
+}
+/// Builds the 256-entry input gamma table for `TRC`, or `None` when no curve is given.
+pub(crate) fn build_input_gamma_table(TRC: Option<&curveType>) -> Option<Box<[f32; 256]>> {
+    TRC.map(|curve| match curve {
+        curveType::Parametric(params) => compute_curve_gamma_table_type_parametric(params),
+        curveType::Curve(samples) => match samples.len() {
+            // no entries: identity
+            0 => compute_curve_gamma_table_type0(),
+            // one entry: a u8Fixed8Number gamma exponent
+            1 => compute_curve_gamma_table_type1(samples[0]),
+            // otherwise: a sampled lookup table
+            _ => compute_curve_gamma_table_type2(samples),
+        },
+    })
+}
+/// Assembles the colorant matrix of `p`: columns are red, green, blue; rows are X, Y, Z.
+pub fn build_colorant_matrix(p: &Profile) -> Matrix {
+    // every component goes through s15Fixed16Number_to_float
+    let to_float = s15Fixed16Number_to_float;
+    let (r, g, b) = (&p.redColorant, &p.greenColorant, &p.blueColorant);
+    Matrix {
+        m: [
+            [to_float(r.X), to_float(g.X), to_float(b.X)],
+            [to_float(r.Y), to_float(g.Y), to_float(b.Y)],
+            [to_float(r.Z), to_float(g.Z), to_float(b.Z)],
+        ],
+    }
+}
+
+/// Parametric representation of a transfer function:
+/// `y = (a * x + b)^g + e` for `x >= d`, and `y = c * x + f` for `x < d`.
+#[derive(Debug)]
+struct Param {
+    /// exponent of the power segment
+    g: f32,
+    /// scale applied to `x` inside the power segment
+    a: f32,
+    /// offset added to `a * x` inside the power segment
+    b: f32,
+    /// slope of the linear segment
+    c: f32,
+    /// break point: inputs below it use the linear segment
+    d: f32,
+    /// constant added to the power segment
+    e: f32,
+    /// constant added to the linear segment
+    f: f32,
+}
+
+impl Param {
+    /// Converts the variable number of parameters contained in profiles
+    /// to the unified representation.
+    ///
+    /// `params[0]` is always `g`. The entries after it select the form:
+    ///
+    /// * none: `a = 1`, `c = 1`, everything else 0
+    /// * `[a, b]`: break point at `-b / a`, linear segment fixed at 0
+    /// * `[a, b, c]`: as `[a, b]`, with `c` added to both segments
+    /// * `[a, b, c, d]`: explicit break point `d` and linear slope `c`
+    /// * `[a, b, c, d, e, f]`: every coefficient given explicitly
+    ///
+    /// # Panics
+    ///
+    /// Panics if `params` is empty or has any other number of entries.
+    #[allow(clippy::many_single_char_names)]
+    fn new(params: &[f32]) -> Param {
+        // the first entry is always the exponent
+        let g: f32 = params[0];
+        // the remaining entries are matched by count; absent coefficients get defaults
+        let (a, b, c, d, e, f) = match params[1..] {
+            [] => (1., 0., 1., 0., 0., 0.),
+            [a, b] => (a, b, 0., -b / a, 0., 0.),
+            [a, b, c] => (a, b, 0., -b / a, c, c),
+            [a, b, c, d] => (a, b, c, d, 0., 0.),
+            [a, b, c, d, e, f] => (a, b, c, d, e, f),
+            _ => panic!(),
+        };
+        Param {
+            g,
+            a,
+            b,
+            c,
+            d,
+            e,
+            f,
+        }
+    }
+
+    /// Evaluates the transfer function at `x`.
+    ///
+    /// Inputs below the break point `d` use the linear segment, all others
+    /// the power segment.
+    #[allow(clippy::many_single_char_names)]
+    fn eval(&self, x: f32) -> f32 {
+        let &Param { g, a, b, c, d, e, f } = self;
+        if x < d {
+            // linear segment below the break point
+            return c * x + f;
+        }
+        // power segment; a NaN `x` also ends up here because `x < d` is false
+        (a * x + b).powf(g) + e
+    }
+
+    /// Returns the inverse transfer function.
+    ///
+    /// Returns `None` when the two segments differ by more than 0.1 at the
+    /// break point, or when any coefficient of the inverse is not finite.
+    #[allow(clippy::many_single_char_names)]
+    fn invert(&self) -> Option<Param> {
+        // First check if the function is continuous at the cross-over point d
+        // by evaluating both segments there.
+        let power_at_d = (self.a * self.d + self.b).powf(self.g) + self.e;
+        let linear_at_d = self.c * self.d + self.f;
+        if (power_at_d - linear_at_d).abs() > 0.1 {
+            return None;
+        }
+        // the break point of the inverse is the output value at `d`
+        let d = power_at_d;
+
+        // Inverting the power segment:
+        // y = (a * x + b)^g + e
+        // y - e = (a * x + b)^g
+        // (y - e)^(1/g) = a*x + b
+        // (y - e)^(1/g) - b = a*x
+        // (y - e)^(1/g)/a - b/a = x
+        // ((y - e)/a^g)^(1/g) - b/a = x
+        // ((1/(a^g)) * y - e/(a^g))^(1/g) - b/a = x
+        let a_pow_g = self.a.powf(self.g);
+        let g = 1. / self.g;
+        let a = 1. / a_pow_g;
+        let b = -self.e / a_pow_g;
+        let e = -self.b / self.a;
+
+        // Inverting the linear segment:
+        // y = c * x + f
+        // y - f = c * x
+        // y/c - f/c = x
+        let (c, f) = if d <= 0. {
+            // substitute the identity line rather than dividing by `self.c`
+            (1., 0.)
+        } else {
+            (1. / self.c, -self.f / self.c)
+        };
+
+        // if self.d > 0. and self.c == 0 as is likely with type 1 and 2 parametric function
+        // then c and f will not be finite.
+        let coefficients = [g, a, b, c, d, e, f];
+        if !coefficients.iter().all(|v| v.is_finite()) {
+            return None;
+        }
+
+        // every coefficient is finite: assemble the inverse
+        Some(Param {
+            g,
+            a,
+            b,
+            c,
+            d,
+            e,
+            f,
+        })
+    }
+}
+
+#[test]
+fn param_invert() {
+    let p3 = Param::new(&[2.4, 0.948, 0.052, 0.077, 0.04]); // g followed by [a, b, c, d]
+    p3.invert().unwrap();
+    let g2_2 = Param::new(&[2.2]); // g only
+    g2_2.invert().unwrap();
+    let g2_2 = Param::new(&[2.2, 0.9, 0.052]); // g followed by [a, b]
+    g2_2.invert().unwrap();
+    let g2_2 = dbg!(Param::new(&[2.2, 0.9, -0.52])); // same form with a negative b
+    g2_2.invert().unwrap();
+    let g2_2 = dbg!(Param::new(&[2.2, 0.9, -0.52, 0.1])); // g followed by [a, b, c]
+    assert!(g2_2.invert().is_none()); // this form has linear slope c = 0; the test expects no inverse
+}
+
+/* Searches for the 16-bit input that lut_interp_linear16 maps to Value on LutTable.
+ * The code is copied nearly directly from lcms. It could be much better: Argyll seems
+ * to have better code in icmTable_lookup_bwd and icmTable_setup_bwd. However, for now
+ * this is a quick way to a working solution and allows for easy comparing with lcms. */
+#[no_mangle]
+#[allow(clippy::many_single_char_names)]
+pub fn lut_inverse_interp16(Value: u16, LutTable: &[u16]) -> uint16_fract_t {
+    let mut l: i32 = 1; // search bounds are signed ('int') to leave room for negative values
+    let mut r: i32 = 0x10000;
+    let mut x: i32 = 0;
+    let mut res: i32;
+    let length = LutTable.len() as i32;
+
+    let mut NumZeroes: i32 = 0; // number of leading entries equal to 0, capped at length - 1
+    while LutTable[NumZeroes as usize] as i32 == 0 && NumZeroes < length - 1 {
+        NumZeroes += 1
+    }
+    // There are no zeros at the beginning and we are trying to find a zero, so
+    // return anything. It seems zero would be the least destructive choice.
+    /* I'm not sure that this makes sense, but oh well... */
+    if NumZeroes == 0 && Value as i32 == 0 {
+        return 0u16;
+    }
+    let mut NumPoles: i32 = 0; // number of trailing entries equal to 0xffff, capped at length - 1
+    while LutTable[(length - 1 - NumPoles) as usize] as i32 == 0xffff && NumPoles < length - 1 {
+        NumPoles += 1
+    }
+    // A run of zeros or poles at either end: narrow the search range to exclude it
+    if NumZeroes > 1 || NumPoles > 1 {
+        let a_0: i32;
+        let b_0: i32;
+        // Identify whether the value falls into the 0 or FFFF zone
+        if Value as i32 == 0 {
+            return 0u16;
+        }
+        // if (Value == 0xFFFF) return 0xFFFF;
+        // else restrict to valid zone
+        if NumZeroes > 1 {
+            a_0 = (NumZeroes - 1) * 0xffff / (length - 1); // last leading zero, on the 0..0xffff input scale
+            l = a_0 - 1
+        }
+        if NumPoles > 1 {
+            b_0 = (length - 1 - NumPoles) * 0xffff / (length - 1); // entry just before the trailing poles, same scale
+            r = b_0 + 1
+        }
+    }
+    if r <= l {
+        // If this happens LutTable is not invertible
+        return 0u16;
+    }
+    // Seems not a degenerated case... apply binary search
+    while r > l {
+        x = (l + r) / 2;
+        res = lut_interp_linear16((x - 1) as uint16_fract_t, LutTable) as i32; // forward lookup of candidate x - 1
+        if res == Value as i32 {
+            // Found exact match.
+            return (x - 1) as uint16_fract_t;
+        }
+        if res > Value as i32 {
+            r = x - 1
+        } else {
+            l = x + 1
+        }
+    }
+
+    // Not found, should we interpolate?
+
+    // Get surrounding nodes
+    debug_assert!(x >= 1);
+
+    let val2: f64 = (length - 1) as f64 * ((x - 1) as f64 / 65535.0f64); // table position of x - 1
+    let cell0: i32 = val2.floor() as i32;
+    let cell1: i32 = val2.ceil() as i32;
+    if cell0 == cell1 { // x - 1 sits exactly on a table node
+        return x as uint16_fract_t;
+    }
+
+    let y0: f64 = LutTable[cell0 as usize] as f64;
+    let x0: f64 = 65535.0f64 * cell0 as f64 / (length - 1) as f64;
+    let y1: f64 = LutTable[cell1 as usize] as f64;
+    let x1: f64 = 65535.0f64 * cell1 as f64 / (length - 1) as f64;
+    let a: f64 = (y1 - y0) / (x1 - x0); // slope of the line through the two surrounding nodes
+    let b: f64 = y0 - a * x0; // and its intercept: y = a * x + b
+    if a.abs() < 0.01f64 { // nearly flat segment: do not divide by the slope
+        return x as uint16_fract_t;
+    }
+    let f: f64 = (Value as i32 as f64 - b) / a; // solve the line for the input that yields Value
+    if f < 0.0f64 {
+        return 0u16;
+    }
+    if f >= 65535.0f64 {
+        return 0xffffu16;
+    }
+    (f + 0.5f64).floor() as uint16_fract_t // round to nearest
+}
+/*
+The number of entries needed to invert a lookup table is not necessarily
+the same as the original number of entries. This is especially true of
+lookup tables that have a small number of entries.
+
+For example, a table like
+   {0, 3104, 14263, 34802, 65535}
+is inverted by invert_lut into
+   {3, 34459, 47529, 56801, 65535}
+which has a maximum error of about 9855 (pixel difference of ~38.346).
+
+For now, we punt the decision of output size to the caller. */
+fn invert_lut(table: &[u16], out_length: usize) -> Vec<u16> {
+    // The inverse is sampled at `out_length` evenly spaced values in 0..65535,
+    // each of which is looked up in `table` with lut_inverse_interp16.
+    (0..out_length)
+        .map(|i| {
+            let x: f64 = i as f64 * 65535.0f64 / (out_length - 1) as f64;
+            // round to the nearest u16 fraction
+            let target = (x + 0.5f64).floor() as uint16_fract_t;
+            lut_inverse_interp16(target, table)
+        })
+        .collect()
+}
+/// Fills `output[v]` with `255 * (v / PRECACHE_OUTPUT_MAX)^gamma`, cast to u8.
+fn compute_precache_pow(output: &mut [u8; PRECACHE_OUTPUT_SIZE], gamma: f32) {
+    for (v, slot) in output.iter_mut().enumerate() {
+        //XXX: don't do integer/float conversion... and round?
+        *slot = (255. * (v as f32 / PRECACHE_OUTPUT_MAX as f32).powf(gamma)) as u8;
+    }
+}
+/// Fills `output[v]` with `lut_interp_linear_precache_output(v, table)` for every index `v`.
+pub fn compute_precache_lut(output: &mut [u8; PRECACHE_OUTPUT_SIZE], table: &[u16]) {
+    for (v, slot) in output.iter_mut().enumerate() {
+        *slot = lut_interp_linear_precache_output(v as u32, table);
+    }
+}
+/// Fills `output[v]` with `v` integer-divided by `PRECACHE_OUTPUT_SIZE / 256`, cast to u8.
+pub fn compute_precache_linear(output: &mut [u8; PRECACHE_OUTPUT_SIZE]) {
+    for (v, slot) in output.iter_mut().enumerate() {
+        //XXX: round?
+        *slot = (v / (PRECACHE_OUTPUT_SIZE / 256)) as u8;
+    }
+}
+/// Fills `output` with the precached output table for `trc`.
+///
+/// The curve is inverted first: sampled and parametric curves through
+/// `invert_lut`, a single gamma value by taking its reciprocal, and an empty
+/// curve is treated as the identity.
+///
+/// Always returns `true`.
+pub(crate) fn compute_precache(trc: &curveType, output: &mut [u8; PRECACHE_OUTPUT_SIZE]) -> bool {
+    //XXX: the choice of a minimum of 256 here is not backed by any theory,
+    //     measurement or data, however it is what lcms uses.
+    //     the maximum number we would need is 65535 because that's the
+    //     accuracy used for computing the pre cache table
+    const MIN_INVERTED_SIZE: usize = 256;
+
+    match trc {
+        curveType::Parametric(params) => {
+            // sample the curve at 256 points ...
+            let gamma_table = compute_curve_gamma_table_type_parametric(params);
+            // ... and convert every sample to a u16 fraction of 65535
+            let mut gamma_table_uint: [u16; 256] = [0; 256];
+            for (dst, src) in gamma_table_uint.iter_mut().zip(gamma_table.iter()) {
+                *dst = (*src * 65535f32) as u16;
+            }
+            // 256 samples already meet the minimum, so the inverse has 256 entries too
+            let inverted = invert_lut(&gamma_table_uint, MIN_INVERTED_SIZE);
+            compute_precache_lut(output, &inverted);
+        }
+        curveType::Curve(data) => match data.len() {
+            // no entries: identity curve
+            0 => compute_precache_linear(output),
+            // one entry: a u8Fixed8Number gamma, inverted via its reciprocal
+            1 => compute_precache_pow(output, 1. / u8Fixed8Number_to_float(data[0])),
+            // otherwise: a sampled curve, inverted numerically
+            _ => {
+                //XXX turn this conversion into a function
+                let inverted_size = data.len().max(MIN_INVERTED_SIZE);
+                let inverted = invert_lut(data, inverted_size);
+                compute_precache_lut(output, &inverted);
+            }
+        },
+    }
+    true
+}
+/// Builds an identity table: `length` evenly spaced values from 0 to 65535, rounded to nearest.
+fn build_linear_table(length: usize) -> Vec<u16> {
+    (0..length)
+        .map(|i| {
+            let x: f64 = i as f64 * 65535.0f64 / (length - 1) as f64;
+            (x + 0.5f64).floor() as uint16_fract_t
+        })
+        .collect()
+}
+/// Builds a `length`-entry table of `(i / (length - 1))^gamma`, scaled to 0..65535
+/// and rounded to the nearest integer.
+fn build_pow_table(gamma: f32, length: usize) -> Vec<u16> {
+    let exponent = gamma as f64;
+    // the power, the scaling and the rounding are all carried out in f64
+    (0..length)
+        .map(|i| (i as f64 / (length - 1) as f64).powf(exponent))
+        .map(|y| (y * 65535.0f64 + 0.5f64).floor() as uint16_fract_t)
+        .collect()
+}
+
+/// Samples `params` at `len` evenly spaced inputs in 0..1; each sample is
+/// multiplied by 65535 and cast to u16.
+fn to_lut(params: &Param, len: usize) -> Vec<u16> {
+    // the float-to-u16 `as` cast saturates and maps NaN to 0, so samples cannot wrap
+    (0..len)
+        .map(|i| (params.eval(i as f32 / (len - 1) as f32) * 65535.) as u16)
+        .collect()
+}
+
+/// Builds the u16 lookup table that represents `trc`.
+/// `lut_len` overrides the default length (256 for parametric curves, 4096 for
+/// generated identity and gamma tables). A sampled curve is returned as a copy;
+/// the function panics if a requested `lut_len` differs from its length.
+pub(crate) fn build_lut_for_linear_from_tf(trc: &curveType, lut_len: Option<usize>) -> Vec<u16> {
+    match trc {
+        curveType::Parametric(params) => {
+            let curve = Param::new(params);
+            to_lut(&curve, lut_len.unwrap_or(256))
+        }
+        curveType::Curve(data) => match data.len() {
+            // no entries: identity table
+            0 => build_linear_table(lut_len.unwrap_or(4096)),
+            // one entry: a u8Fixed8Number gamma exponent
+            1 => {
+                let gamma = u8Fixed8Number_to_float(data[0]);
+                build_pow_table(gamma, lut_len.unwrap_or(4096))
+            }
+            sampled_len => {
+                assert_eq!(lut_len.unwrap_or(sampled_len), sampled_len);
+                data.clone() // I feel bad about this.
+            }
+        },
+    }
+}
+
+/// Builds the lookup table for the inverse of `trc`.
+///
+/// Returns `None` only for a parametric curve whose `Param::invert` fails.
+pub(crate) fn build_lut_for_tf_from_linear(trc: &curveType) -> Option<Vec<u16>> {
+    const PARAMETRIC_LUT_LEN: usize = 256;
+    const AUTOGEN_LUT_LEN: usize = 4096;
+
+    let data = match trc {
+        // A parametric curve is inverted analytically; when that fails we
+        // return None instead of falling through to generic lut inversion.
+        curveType::Parametric(params) => {
+            let inverse = Param::new(params).invert()?;
+            return Some(to_lut(&inverse, PARAMETRIC_LUT_LEN));
+        }
+        curveType::Curve(data) => data,
+    };
+    match data.len() {
+        // no entries: the identity table is its own inverse
+        0 => return Some(build_linear_table(AUTOGEN_LUT_LEN)),
+        // one entry: a pure gamma is inverted by taking the reciprocal exponent
+        1 => {
+            let gamma = 1. / u8Fixed8Number_to_float(data[0]);
+            return Some(build_pow_table(gamma, AUTOGEN_LUT_LEN));
+        }
+        _ => {}
+    }
+    // generic case: invert the sampled table numerically
+    let linear_from_tf = build_lut_for_linear_from_tf(trc, None);
+    //XXX: the choice of a minimum of 256 here is not backed by any theory,
+    //     measurement or data, however it is what lcms uses.
+    let inverted_lut_len = linear_from_tf.len().max(256);
+    Some(invert_lut(&linear_from_tf, inverted_lut_len))
+}
+
+pub(crate) fn build_output_lut(trc: &curveType) -> Option<Vec<u16>> {
+    build_lut_for_tf_from_linear(trc) // plain alias: forwards `trc` and returns the result unchanged
+}
-- 
cgit v1.2.3