summaryrefslogtreecommitdiffstats
path: root/js/src/intgemm
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/intgemm')
-rw-r--r--js/src/intgemm/IntegerGemmIntrinsic.cpp433
-rw-r--r--js/src/intgemm/IntegerGemmIntrinsic.h358
-rw-r--r--js/src/intgemm/README_MOZILLA18
-rw-r--r--js/src/intgemm/moz.build53
4 files changed, 862 insertions, 0 deletions
diff --git a/js/src/intgemm/IntegerGemmIntrinsic.cpp b/js/src/intgemm/IntegerGemmIntrinsic.cpp
new file mode 100644
index 0000000000..800e6153b7
--- /dev/null
+++ b/js/src/intgemm/IntegerGemmIntrinsic.cpp
@@ -0,0 +1,433 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+#include "intgemm/IntegerGemmIntrinsic.h"
+
+#include "mozilla/CheckedInt.h"
+#include "mozilla/IntegerPrintfMacros.h"
+
+#include <gemmology_fwd.h>
+
+#include "js/ErrorReport.h"
+#include "js/HeapAPI.h"
+#include "vm/ArrayBufferObject.h"
+#include "wasm/WasmBuiltins.h"
+#include "wasm/WasmInstance.h"
+#include "wasm/WasmLog.h"
+
+#if defined(USE_AVX2)
+# define SUPPORTED_ARCHS \
+ xsimd::arch_list<xsimd::avx2, xsimd::ssse3, xsimd::sse2>
+#elif defined(USE_SSSE3)
+# define SUPPORTED_ARCHS xsimd::arch_list<xsimd::ssse3, xsimd::sse2>
+#elif defined(USE_SSE2)
+# define SUPPORTED_ARCHS xsimd::arch_list<xsimd::sse2>
+#elif defined(USE_NEON) and defined(XSIMD_WITH_NEON64)
+# define SUPPORTED_ARCHS xsimd::arch_list<xsimd::neon64>
+#else
+# error no supported architecture
+#endif
+
+// Dispatch *at runtime* based on run-time hardware and compile-time
+// architectures.
+//
+// FIXME: Ideally we would not run the dispatch code at each function call.
+#define GEMMOLOGY_DISPATCH(FUNC) \
+ xsimd::dispatch<SUPPORTED_ARCHS>([](auto arch, auto... args) { \
+ return gemmology::Engine<decltype(arch)>::FUNC(args...); \
+ })
+
struct JSContext;

// Required byte alignment for matrix operands inside wasm memory; enforced by
// CheckMatrixBoundAndAlignment on the wasm offset.
static constexpr uint32_t ARRAY_ALIGNMENT = 64;
// A dimension is valid when it is a non-zero multiple of its multiplier
// (see CheckMatrixDimension). Rows of A are unconstrained (multiplier 1).
static constexpr uint32_t ROWS_A_MULTIPLIER = 1;
static constexpr uint32_t COLUMNS_A_MULTIPLIER = 64;
// Rows of B must equal columns of A (the shared "width" of the multiply),
// hence the same multiplier.
static constexpr uint32_t ROWS_B_MULTIPLIER = COLUMNS_A_MULTIPLIER;
static constexpr uint32_t COLUMNS_B_MULTIPLIER = 8;
static constexpr uint32_t SELECTED_COLUMNS_B_MULTIPLIER = 8;
+
+void ReportGemmError(JSContext* cx, const unsigned errorNumber) {
+ JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, errorNumber);
+}
+
+size_t GetWasmRawBufferLength(const uint8_t* memBase) {
+ const js::WasmArrayRawBuffer* rawBuf =
+ js::WasmArrayRawBuffer::fromDataPtr(memBase);
+ return rawBuf->byteLength();
+}
+
+bool CheckMatrixDimension(JSContext* cx, uint32_t size,
+ uint32_t sizeMultiplier) {
+ // A valid size is a positive integral multiple of Multiplier
+ if ((size == 0) || (size % sizeMultiplier != 0)) {
+ js::wasm::Log(
+ cx, "Invalid dimension value:%" PRIu32 " (should be a multiple of %u)",
+ size, sizeMultiplier);
+ return false;
+ }
+ return true;
+}
+
// Check that the region starting at wasm offset `input` with extent
// `inputSize` lies inside a wasm buffer of `wasmBufferSize` bytes. The end
// offset is computed with CheckedUint64 so arithmetic overflow is detected
// rather than wrapping.
//
// NOTE(review): callers pass `inputSize` as an element count (rows * cols);
// for float/uint32 matrices the byte extent is 4x larger — confirm whether
// the bound is intended to be in elements or bytes.
// NOTE(review): the `>=` comparison rejects a region ending exactly at the
// buffer end (exclusive end == wasmBufferSize); this is conservative.
//
// @return true when the region is fully in bounds
bool CheckMatrixBound(JSContext* cx, uint32_t input, uint64_t inputSize,
                      size_t wasmBufferSize) {
  mozilla::CheckedUint64 inputUpperLimit(inputSize);
  inputUpperLimit += input;

  // Bound check fails if size overflows or it spans outside the wasm memory
  if (!inputUpperLimit.isValid() ||
      (inputUpperLimit.value() >= (uint64_t)wasmBufferSize)) {
    js::wasm::Log(cx, "Memory out of wasm bounds for matrix:%" PRIu32, input);
    return false;
  }
  return true;
}
+
+bool CheckMatrixBoundAndAlignment(JSContext* cx, uint32_t input,
+ uint64_t inputSize, size_t wasmBufferSize) {
+ // Alignment check: It is sufficient to check alignment for the offset rather
+ // than for the actual pointer within wasm memory (as long as following assert
+ // is satisfied)
+ static_assert(js::gc::PageSize >= ARRAY_ALIGNMENT,
+ "PageSize should be bigger than Alignment");
+ if (input % ARRAY_ALIGNMENT != 0) {
+ js::wasm::Log(
+ cx, "Unaligned access for matrix:%" PRIu32 " (should be %u aligned)",
+ input, ARRAY_ALIGNMENT);
+ return false;
+ }
+
+ // Check Bound
+ return CheckMatrixBound(cx, input, inputSize, wasmBufferSize);
+}
+
// Implements the `int8_prepare_b` intrinsic: quantize Input matrix B (float,
// row-major) into the CPU-dependent prepared form consumed by the multiply
// intrinsic. Matrix arguments are byte offsets into wasm memory (`memBase`).
// Returns 0 on success; returns -1 after reporting a JS error on failure.
// NOTE(review): `zeroPoint` is accepted but not forwarded to gemmology.
int32_t js::intgemm::IntrI8PrepareB(wasm::Instance* instance,
                                    uint32_t inputMatrixB, float scale,
                                    float zeroPoint, uint32_t rowsB,
                                    uint32_t colsB, uint32_t outputMatrixB,
                                    uint8_t* memBase) {
  // This intrinsic signals failure through a negative i32 result.
  MOZ_ASSERT(wasm::SASigIntrI8PrepareB.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->cx();

  // Size checks for matrices: rowsB must be a non-zero multiple of 64,
  // colsB a non-zero multiple of 8.
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
              colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound and alignment checks for both matrices.
  // NOTE(review): sizeB counts elements; the float input spans 4x as many
  // bytes — confirm the bound check is meant to be element-based.
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixB, sizeB, wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, outputMatrixB, sizeB, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: inputB:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " outputB:%x sizeB:%" PRIu64 " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixB, rowsB, colsB, outputMatrixB, sizeB,
              wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm) for PrepareB
  uint8_t* inputMatrixBPtr = &memBase[inputMatrixB];
  uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
  GEMMOLOGY_DISPATCH(PrepareB)
  ((const float*)inputMatrixBPtr, (int8_t*)outputMatrixBPtr,
   (float)scale,  // Quant Mult
   rowsB, colsB);
  return 0;
}
+
// Implements the `int8_prepare_b_from_transposed` intrinsic: quantize a
// transposed (column-major, float) Input matrix B into the CPU-dependent
// prepared form. Matrix arguments are byte offsets into wasm memory.
// Returns 0 on success; returns -1 after reporting a JS error on failure.
// NOTE(review): `zeroPoint` is accepted but not forwarded to gemmology.
int32_t js::intgemm::IntrI8PrepareBFromTransposed(
    wasm::Instance* instance, uint32_t inputMatrixBTransposed, float scale,
    float zeroPoint, uint32_t rowsB, uint32_t colsB, uint32_t outputMatrixB,
    uint8_t* memBase) {
  // Failure is signalled through a negative i32 result.
  MOZ_ASSERT(wasm::SASigIntrI8PrepareBFromTransposed.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->cx();

  // Size checks for matrices (rowsB: multiple of 64, colsB: multiple of 8).
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
              colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices (element-count based; see
  // CheckMatrixBound).
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBTransposed, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, outputMatrixB, sizeB, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: inputBT:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " outputB:%x sizeB:%" PRIu64 " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixBTransposed, rowsB, colsB, outputMatrixB,
              sizeB, wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm) for PrepareBTransposed
  uint8_t* inputMatrixBTransposedPtr = &memBase[inputMatrixBTransposed];
  uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
  GEMMOLOGY_DISPATCH(PrepareBTransposed)
  ((const float*)inputMatrixBTransposedPtr, (int8_t*)outputMatrixBPtr,
   (float)scale,  // Quant Mult
   rowsB, colsB);
  return 0;
}
+
+int32_t js::intgemm::IntrI8PrepareBFromQuantizedTransposed(
+ wasm::Instance* instance, uint32_t inputMatrixBQuantizedTransposed,
+ uint32_t rowsB, uint32_t colsB, uint32_t outputMatrixB, uint8_t* memBase) {
+ MOZ_ASSERT(wasm::SASigIntrI8PrepareBFromQuantizedTransposed.failureMode ==
+ wasm::FailureMode::FailOnNegI32);
+ JSContext* cx = instance->cx();
+
+ // Size checks for matricies
+ if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
+ !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
+ wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
+ colsB);
+ ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
+ return -1;
+ }
+
+ // Memory Bound checks for all matricies
+ uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
+ size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
+ if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBQuantizedTransposed, sizeB,
+ wasmBufferSize) ||
+ !CheckMatrixBoundAndAlignment(cx, outputMatrixB, sizeB, wasmBufferSize)) {
+ wasm::Log(cx,
+ "%s: inputBQT:%x rowsB:%" PRIu32 " colsB:%" PRIu32
+ " outputB:%x sizeA:%" PRIu64 " wasmBufferSize:%zu",
+ __FUNCTION__, inputMatrixBQuantizedTransposed, rowsB, colsB,
+ outputMatrixB, sizeB, wasmBufferSize);
+ ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
+ return -1;
+ }
+
+ // Actual call to the 3rd party library (intgemm)
+ uint8_t* inputMatrixBQuantizedTransposedPtr =
+ &memBase[inputMatrixBQuantizedTransposed];
+ uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
+ GEMMOLOGY_DISPATCH(PrepareBQuantizedTransposed)
+ ((const int8_t*)inputMatrixBQuantizedTransposedPtr, (int8_t*)outputMatrixBPtr,
+ rowsB, colsB);
+ return 0;
+}
+
// Implements the `int8_prepare_a` intrinsic: quantize Input matrix A (float,
// row-major) into prepared A via gemmology's Shift::PrepareA. The output
// pointer is passed as uint8_t* because the prepared representation may be
// unsigned on some architectures (see the header documentation).
// Matrix arguments are byte offsets into wasm memory (`memBase`).
// Returns 0 on success; returns -1 after reporting a JS error on failure.
// NOTE(review): `zeroPoint` is accepted but not forwarded to gemmology.
int32_t js::intgemm::IntrI8PrepareA(wasm::Instance* instance,
                                    uint32_t inputMatrixA, float scale,
                                    float zeroPoint, uint32_t rowsA,
                                    uint32_t colsA, uint32_t outputMatrixA,
                                    uint8_t* memBase) {
  // Failure is signalled through a negative i32 result.
  MOZ_ASSERT(wasm::SASigIntrI8PrepareA.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->cx();

  // Size checks for matrices (rowsA: any positive value, colsA: multiple
  // of 64).
  if (!CheckMatrixDimension(cx, rowsA, ROWS_A_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsA, COLUMNS_A_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsA:%" PRIu32 " colsA:%" PRIu32, __FUNCTION__, rowsA,
              colsA);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices (element-count based; see
  // CheckMatrixBound).
  uint64_t sizeA = (uint64_t)rowsA * (uint64_t)colsA;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixA, sizeA, wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, outputMatrixA, sizeA, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: inputA:%x rowsA:%" PRIu32 " colsA:%" PRIu32
              " outputA:%x sizeA:%" PRIu64 " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixA, rowsA, colsA, outputMatrixA, sizeA,
              wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm)
  uint8_t* inputMatrixAPtr = &memBase[inputMatrixA];
  uint8_t* outputMatrixAPtr = &memBase[outputMatrixA];
  GEMMOLOGY_DISPATCH(Shift::PrepareA)
  ((const float*)inputMatrixAPtr, outputMatrixAPtr, scale, rowsA, colsA);
  return 0;
}
+
// Implements the `int8_prepare_bias` intrinsic: fold the quantization scales
// of A and B together with the raw bias (length colsB) into the prepared bias
// consumed by the multiply intrinsic. Array arguments are byte offsets into
// wasm memory (`memBase`).
// Returns 0 on success; returns -1 after reporting a JS error on failure.
// NOTE(review): `zeroPointA`/`zeroPointB` are accepted but unused here.
int32_t js::intgemm::IntrI8PrepareBias(
    wasm::Instance* instance, uint32_t inputMatrixBPrepared, float scaleA,
    float zeroPointA, float scaleB, float zeroPointB, uint32_t rowsB,
    uint32_t colsB, uint32_t inputBias, uint32_t output, uint8_t* memBase) {
  // Failure is signalled through a negative i32 result.
  MOZ_ASSERT(wasm::SASigIntrI8PrepareBias.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->cx();

  // Size checks for matrices (rowsB: multiple of 64, colsB: multiple of 8).
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
              colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices. The prepared B must also be
  // 64-byte aligned; the bias vectors (length colsB) only need to be in
  // bounds.
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  uint64_t sizeBias = colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBPrepared, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBound(cx, inputBias, sizeBias, wasmBufferSize) ||
      !CheckMatrixBound(cx, output, sizeBias, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: preparedB:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " inputBias:%x outputBias:%x sizeB:%" PRIu64
              " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixBPrepared, rowsB, colsB, inputBias,
              output, sizeB, wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm)
  uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
  uint8_t* inputBiasPtr = &memBase[inputBias];
  uint8_t* outputPtr = &memBase[output];
  // unquantFactor = -(127/scaleA)*(127/scaleB)/127. The negative sign
  // presumably compensates for the unsigned shift applied to prepared A —
  // confirm against gemmology's Shift::PrepareBias contract.
  float unquantFactor =
      (-1) * ((127.0f / scaleA) * (127.0f / scaleB)) / (127.0f);
  GEMMOLOGY_DISPATCH(Shift::PrepareBias)
  ((const int8_t*)inputMatrixBPreparedPtr, rowsB, colsB,
   gemmology::callbacks::UnquantizeAndAddBiasAndWrite(
       unquantFactor, (const float*)inputBiasPtr, (float*)outputPtr));
  return 0;
}
+
// Implements the `int8_multiply_and_add_bias` intrinsic:
//   output = preparedA (rowsA x width) * preparedB (width x colsB)
//          + preparedBias (colsB), unquantized to float.
// All array arguments are byte offsets into wasm memory (`memBase`); the
// prepared A/B operands must be 64-byte aligned.
// Returns 0 on success; returns -1 after reporting a JS error on failure.
// NOTE(review): `zeroPointA`/`zeroPointB` are accepted but unused here.
int32_t js::intgemm::IntrI8MultiplyAndAddBias(
    wasm::Instance* instance, uint32_t inputMatrixAPrepared, float scaleA,
    float zeroPointA, uint32_t inputMatrixBPrepared, float scaleB,
    float zeroPointB, uint32_t inputBiasPrepared, float unquantMultiplier,
    uint32_t rowsA, uint32_t width, uint32_t colsB, uint32_t output,
    uint8_t* memBase) {
  // Failure is signalled through a negative i32 result.
  MOZ_ASSERT(wasm::SASigIntrI8MultiplyAndAddBias.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->cx();

  // Size checks for matrices: width (colsA == rowsB) must be a multiple of
  // 64, colsB a multiple of 8; rowsA just non-zero.
  if (!CheckMatrixDimension(cx, rowsA, ROWS_A_MULTIPLIER) ||
      !CheckMatrixDimension(cx, width, COLUMNS_A_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsA:%" PRIu32 " width:%" PRIu32 " colsB:%" PRIu32,
              __FUNCTION__, rowsA, width, colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices (element-count based; see
  // CheckMatrixBound).
  uint64_t sizeA = (uint64_t)rowsA * (uint64_t)width;
  uint64_t sizeB = (uint64_t)width * (uint64_t)colsB;
  uint64_t sizeBias = (uint64_t)colsB;
  uint64_t sizeOutput = (uint64_t)rowsA * (uint64_t)colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixAPrepared, sizeA,
                                    wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, inputMatrixBPrepared, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBound(cx, inputBiasPrepared, sizeBias, wasmBufferSize) ||
      !CheckMatrixBound(cx, output, sizeOutput, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: preparedA:%x preparedB:%x preparedBias:%x rowsA:%" PRIu32
              " width:%" PRIu32 " colsB:%" PRIu32
              " output:%x sizeA:%" PRIu64 " sizeB:%" PRIu64
              " sizeBias:%" PRIu64 " sizeOutput:%" PRIu64,
              __FUNCTION__, inputMatrixAPrepared, inputMatrixBPrepared,
              inputBiasPrepared, rowsA, width, colsB, output, sizeA, sizeB,
              sizeBias, sizeOutput);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm)
  uint8_t* inputMatrixAPreparedPtr = &memBase[inputMatrixAPrepared];
  uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
  uint8_t* inputBiasPreparedPtr = &memBase[inputBiasPrepared];
  uint8_t* outputPtr = &memBase[output];
  // Combined unquantization factor: caller-supplied multiplier divided by
  // both quantization scales.
  float unquantFactor = unquantMultiplier / (scaleA * scaleB);

  // Prepared A is handed over as uint8_t* (matching Shift::PrepareA's
  // output); prepared B is signed int8.
  GEMMOLOGY_DISPATCH(Shift::Multiply)
  (inputMatrixAPreparedPtr, (const int8_t*)inputMatrixBPreparedPtr, rowsA,
   width, colsB,
   gemmology::callbacks::UnquantizeAndAddBiasAndWrite(
       unquantFactor, (const float*)inputBiasPreparedPtr, (float*)outputPtr));
  return 0;
}
+
// Implements the `int8_select_columns_of_b` intrinsic: copy the columns of
// prepared B whose indices appear in `colIndexList` (interpreted as an array
// of `sizeColIndexList` uint32_t values) into `output`
// (rowsB x sizeColIndexList). Array arguments are byte offsets into wasm
// memory (`memBase`).
// Returns 0 on success; returns -1 after reporting a JS error on failure.
int32_t js::intgemm::IntrI8SelectColumnsOfB(wasm::Instance* instance,
                                            uint32_t inputMatrixBPrepared,
                                            uint32_t rowsB, uint32_t colsB,
                                            uint32_t colIndexList,
                                            uint32_t sizeColIndexList,
                                            uint32_t output, uint8_t* memBase) {
  // Failure is signalled through a negative i32 result.
  MOZ_ASSERT(wasm::SASigIntrI8SelectColumnsOfB.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->cx();

  // Size checks for matrices (rowsB: multiple of 64; colsB and the number of
  // selected columns: multiples of 8).
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, sizeColIndexList,
                            SELECTED_COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx,
              "%s: rowsB:%" PRIu32 " colsB:%" PRIu32
              " sizeColIndexList:%" PRIu32,
              __FUNCTION__, rowsB, colsB, sizeColIndexList);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices.
  // NOTE(review): colIndexList holds uint32_t entries (4 bytes each) but the
  // bound below uses the entry count — confirm the intended units.
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  uint64_t sizeOutput = (uint64_t)rowsB * (uint64_t)sizeColIndexList;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBPrepared, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBound(cx, colIndexList, sizeColIndexList, wasmBufferSize) ||
      !CheckMatrixBound(cx, output, sizeOutput, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: preparedB:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " colList:%x sizeColList:%" PRIu32 " output:%x sizeB:%" PRIu64
              " sizeOutput:%" PRIu64,
              __FUNCTION__, inputMatrixBPrepared, rowsB, colsB, colIndexList,
              sizeColIndexList, output, sizeB, sizeOutput);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm). The index list is passed
  // as a [begin, end) pointer range.
  uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
  uint8_t* colIndexListPtr = &memBase[colIndexList];
  uint8_t* outputPtr = &memBase[output];
  GEMMOLOGY_DISPATCH(SelectColumnsB)
  ((const int8_t*)inputMatrixBPreparedPtr, (int8_t*)outputPtr, rowsB,
   (const uint32_t*)colIndexListPtr,
   (const uint32_t*)colIndexListPtr + sizeColIndexList);
  return 0;
}
+
+#undef GEMMOLOGY_DISPATCH
+#undef SUPPORTED_ARCHS
diff --git a/js/src/intgemm/IntegerGemmIntrinsic.h b/js/src/intgemm/IntegerGemmIntrinsic.h
new file mode 100644
index 0000000000..2cbb70853f
--- /dev/null
+++ b/js/src/intgemm/IntegerGemmIntrinsic.h
@@ -0,0 +1,358 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef intgemm_IntegerGemmIntrinsic_h
+#define intgemm_IntegerGemmIntrinsic_h
+
+#include <stdint.h>
+
+namespace js {
+namespace wasm {
+class Instance;
+}
+
+namespace intgemm {
+
+/* Interface for integer matrix multiplication followed by addition of bias.
+ *
+ * C = A * B + Bias
+ *
+ * Input matrix A:
+ * - A 2-D matrix that typically represents activations as floating point
+ * values
+ * - no. of rows should be a positive integer
+ * - no. of columns should be a positive integral multiple of 64
+ * - is represented as array (contiguous memory locations) in row-major format
+ *
+ * Input matrix B:
+ * - A 2-D matrix that typically represents fixed model parameters as
+ * floating point values
+ * - no. of rows should be:
+ * -- equal to no. of columns of Input matrix A
+ * -- a positive integral multiple of 64
+ * - no. of columns should be a positive integral multiple of 8
+ * - is represented as array (contiguous memory locations) in row-major format
+ *
+ * Please note that it is also possible to pass Input matrix B in 2 more forms:
+ * - One that is already a quantized and transposed version of Input matrix B
+ * - Other that is already a transposed version of Input matrix B
+ *
+ * Input Bias:
+ * - is an array (contiguous memory locations) that represents bias
+ * - size of the array should be equal to the no. of columns of Input matrix B
+ *
+ * Output matrix C:
+ * - is a 2-D matrix that represents the result (= A * B + Bias)
+ * - no. of rows = no. of rows of Input matrix A
+ * - no. of columns = no. of columns of Input matrix B (in
+ * untransposed form)
+ * - is represented as array (contiguous memory locations) in row-major format
+ *
+ * Please note that most of the functions in this interface might have
+ * architecture specific implementations.
+ *
+ * Conventions followed for the interface:
+ * - Unless explicitly mentioned, Input matrix B refers to an unquantized
+ * (i.e. float values) and non-transposed version
+ * - no. of rows of Input matrix A = `rowsA`
+ * - no. of columns of Input matrix A (`colsA`) = no. of rows of Input matrix B
+ * (`rowsB`) = `width`
+ * - no. of columns of Input matrix B = `colsB`
+ */
+
+/* Prepare B for the Matrix Multiply function from Input matrix B.
+ *
+ * Quantization is performed on the input.
+ * The final prepared B is in CPU-dependent format and can be used as an input
+ * to matrix multiply function (`int8_multiply_and_add_bias`).
+ *
+ * Please note that this interface might have architecture specific
+ * implementation.
+ *
+ * @param[in] inputMatrixB An array representing the Input matrix B in
+ * row-major format.
+ * Size of the array = `rowsB` * `colsB`.
+ * Shape of the matrix: (`rowsB`, `colsB`)
+ * @param[in] scale The scaling factor (for quantization)
+ * @param[in] zeroPoint The zero point (for quantization)
+ * @param[in] rowsB No. of rows of Input matrix B. It should be
+ * a positive integer and a multiple of 64.
+ * @param[in] colsB No. of columns of Input matrix B. It should
+ * be a positive integer and a multiple of 8.
+ * @param[out] outputMatrixB An array representing the prepared B matrix.
+ * Size of the array = `rowsB` * `colsB`.
+ *
+ * This function implements the intrinsic:
+ * int8_prepare_b(inputMatrixB: i32, scale: f32, zeroPoint: f32, rowsB: i32,
+ * colsB: i32, outputMatrixB: i32) which implements the function:
+ * int8_prepare_b(const float* inputMatrixB, float scale, float zeroPoint,
+ * uint32_t rowsB, uint32_t colsB, int8_t* outputMatrixB)
+ */
+int32_t IntrI8PrepareB(wasm::Instance* instance, uint32_t inputMatrixB,
+ float scale, float zeroPoint, uint32_t rowsB,
+ uint32_t colsB, uint32_t outputMatrixB,
+ uint8_t* memBase);
+
+/* Prepare B for the Matrix Multiply function from transposed version of Input
+ * matrix B.
+ *
+ * Quantization is performed on floating values of input.
+ * The final prepared B is in CPU-dependent format and can be used as an input
+ * to matrix multiply function (`int8_multiply_and_add_bias`).
+ *
+ * Please note that this interface might have architecture specific
+ * implementation.
+ *
+ * @param[in] inputMatrixBTransposed An array representing transposed version
+ * of Input matrix B.
+ * It is in column-major format.
+ * Size of the array = `rowsB` * `colsB`.
+ * Shape of the matrix: (`colsB`, `rowsB`)
+ * @param[in] scale The scaling factor (for quantization)
+ * @param[in] zeroPoint The zero point (for quantization)
+ * @param[in] rowsB No. of rows of Input matrix B. It should
+ * be a positive integer and a multiple of
+ * 64.
+ * @param[in] colsB No. of columns of Input matrix B. It
+ * should be a positive integer and a
+ * multiple of 8.
+ * @param[out] outputMatrixB An array representing the prepared B
+ * matrix. Size of array = `rowsB`*`colsB`
+ *
+ * This function implements the intrinsic:
+ * int8_prepare_b_from_transposed(inputMatrixBTransposed: i32, scale: f32,
+ * zeroPoint: f32, rowsB: i32, colsB: i32, outputMatrixB: i32) which implements
+ * the function: int8_prepare_b_from_transposed(const float*
+ * inputMatrixBTransposed, float scale, float zeroPoint, uint32_t rowsB,
+ * uint32_t colsB, int8_t* outputMatrixB)
+ */
+int32_t IntrI8PrepareBFromTransposed(wasm::Instance* instance,
+ uint32_t inputMatrixBTransposed,
+ float scale, float zeroPoint,
+ uint32_t rowsB, uint32_t colsB,
+ uint32_t outputMatrixB, uint8_t* memBase);
+
+/* Prepare B for the Matrix Multiply function from a quantized and transposed
+ * version of Input matrix B which is also in a CPU-independent format.
+ *
+ * The final prepared B is in CPU-dependent format and can be used as an input
+ * to matrix multiply function (`int8_multiply_and_add_bias`).
+ *
+ * This function is useful while using the quantized models that are stored in a
+ * CPU-independent format on the disk.
+ *
+ * @param[in] inputMatrixBQuantizedTransposed An array representing the
+ * quantized and transposed
+ * version of Input matrix B.
+ * It is in column-major format.
+ * Size of array =
+ * `rowsB`*`colsB`
+ * Shape of the matrix:
+ * (`colsB`,`rowsB`)
+ * @param[in] rowsB No. of rows of Input matrix B.
+ * Should be a positive integer
+ * and a multiple of 64.
+ * @param[in] colsB No. of columns of Input matrix
+ * B. Should be a positive
+ * integer and a multiple of 8
+ * @param[out] outputMatrixB An array representing the
+ * prepared B matrix.
+ * Size: `rowsB` * `colsB`.
+ *
+ * This function implements the intrinsic:
+ * int8_prepare_b_from_quantized_transposed(inputMatrixBQuantizedTransposed:
+ * i32, rowsB: i32, colsB: i32, outputMatrixB: i32) which implements the
+ * function: int8_prepare_b_from_quantized_transposed(const int8_t*
+ * inputMatrixBQuantizedTransposed, uint32_t rowsB, uint32_t colsB, int8_t*
+ * outputMatrixB)
+ */
+int32_t IntrI8PrepareBFromQuantizedTransposed(
+ wasm::Instance* instance, uint32_t inputMatrixBQuantizedTransposed,
+ uint32_t rowsB, uint32_t colsB, uint32_t outputMatrixB, uint8_t* memBase);
+
+/* Prepare A for the Matrix Multiply function from Input matrix A.
+ *
+ * It performs quantization on floating values of input.
+ * The final prepared A might be architecture dependent. e.g. On some
+ * architectures like x86, it might be unsigned (achieved by adding 127 to
+ * quantized values) while on others like Arm, it might be signed. The final
+ * prepared A can be used as an input to matrix multiply function
+ * (`int8_multiply_and_add_bias`).
+ *
+ * Please note that this interface might have architecture specific
+ * implementation.
+ *
+ * @param[in] inputMatrixA An array representing the Input matrix A in
+ * row-major format.
+ * Size of the array = `rowsA` * `colsA`.
+ * Shape of the matrix: (`rowsA`, `colsA`)
+ * @param[in] scale The scaling factor (for quantization)
+ * @param[in] zeroPoint The zero point (for quantization)
+ * @param[in] rowsA No. of rows of Input matrix A. It should be a
+ * positive integer.
+ * @param[in] colsA No. of columns of Input matrix A. It should be a
+ * positive integer and a multiple of 64.
+ * @param[out] outputMatrixA An array representing the prepared A matrix.
+ * Size of the array = `rowsA` * `colsA`.
+ *
+ * This function implements the intrinsic:
+ * int8_prepare_a(inputMatrixA: i32, scale: f32, zeroPoint: f32, rowsA: i32,
+ * colsA: i32, outputMatrixA: i32) which implements the function:
+ * int8_prepare_a(const float* inputMatrixA, float scale, float zeroPoint,
+ * uint32_t rowsA, uint32_t colsA, int8_t* outputMatrixA)
+ */
+int32_t IntrI8PrepareA(wasm::Instance* instance, uint32_t inputMatrixA,
+ float scale, float zeroPoint, uint32_t rowsA,
+ uint32_t colsA, uint32_t outputMatrixA,
+ uint8_t* memBase);
+
+/* Prepares bias for the Matrix Multiply function.
+ *
+ * It uses the prepared B (which must be obtained by using any of the
+ * int8_prepare_b* functions) and a bias input to prepare the final bias.
+ *
+ * The final bias can be used as an input to matrix multiply function
+ * (`int8_multiply_and_add_bias`).
+ *
+ * @param[in] inputMatrixBPrepared An array representing the prepared B
+ * matrix. Size of array = `rowsB`*`colsB`.
+ * @param[in] scaleA The scaling factor (for quantization) of A
+ * @param[in] zeroPointA The zero point (for quantization) of A
+ * @param[in] scaleB The scaling factor (for quantization) of B
+ * @param[in] zeroPointB The zero point (for quantization) of B
+ * @param[in] rowsB No. of rows of Input matrix B (unquantized
+ * & non-transposed). It should be a positive
+ * integer and a multiple of 64.
+ * @param[in] colsB No. of columns of Input matrix B
+ * (unquantized & non-transposed). It should
+ * be a positive integer and a multiple of 8.
+ * @param[in] inputBias An array representing the input bias. Size
+ * of array = `colsB`
+ * @param[out] output An array representing the final prepared
+ * bias. Size of the array = `colsB`
+ *
+ * This function implements the intrinsic:
+ * int8_prepare_bias(inputMatrixBPrepared: i32, scaleA: f32, zeroPointA: f32,
+ * scaleB: f32, zeroPointB: f32, rowsB: i32, colsB: i32, inputBias: i32, output:
+ * i32) which implements the function: int8_prepare_bias(const int8_t*
+ * inputMatrixBPrepared, float scaleA, float zeroPointA, float scaleB, float
+ * zeroPointB, uint32_t rowsB, uint32_t colsB, const float* inputBias, float*
+ * output)
+ */
+int32_t IntrI8PrepareBias(wasm::Instance* instance,
+ uint32_t inputMatrixBPrepared, float scaleA,
+ float zeroPointA, float scaleB, float zeroPointB,
+ uint32_t rowsB, uint32_t colsB, uint32_t inputBias,
+ uint32_t output, uint8_t* memBase);
+
+/* Perform multiplication of 2 matrices followed by adding a bias.
+ *
+ * i.e Output = inputMatrixAPrepared * inputMatrixBPrepared + inputBiasPrepared
+ *
+ * The inputs inputMatrixAPrepared, inputMatrixBPrepared and inputBiasPrepared
+ * of this function must be obtained by using `int8_prepare_a`, one of the
+ * `int8_prepare_b*` and `int8_prepare_bias` functions respectively.
+ *
+ * Please note that this interface might have architecture specific
+ * implementation.
+ *
+ * @param[in] inputMatrixAPrepared An array representing the prepared A
+ * matrix. This must be obtained by using
+ * `int8_prepare_a` function. Size of the
+ * array = `rowsA` * `width`.
+ * @param[in] scaleA The scaling factor (quantization) of A
+ * @param[in] zeroPointA The zero point (for quantization) of A
+ * @param[in] inputMatrixBPrepared An array representing the prepared B
+ * matrix. This must be obtained by using
+ * one of `int8_prepare_b*` functions.
+ * Size of the array = `width` * `colsB`.
+ * @param[in] scaleB The scaling factor (quantization) of B
+ * @param[in] zeroPointB The zero point (for quantization) of B
+ * @param[in] inputBiasPrepared An array representing the prepared bias.
+ * This must be obtained by using
+ * `int8_prepare_bias` function.
+ * Size of the array = `colsB`
+ * @param[in] unquantMultiplier A value that will be multiplied to the
+ * final unquantization factor that is
+ * prepared from `scaleA` and `scaleB`.
+ * @param[in] rowsA No. of rows of Input matrix A. It should
+ * be a positive integer.
+ * @param[in] width No. of columns of Input matrix A (same as
+ * no. of rows of Input matrix B). It
+ * should be a positive integer and a
+ * multiple of 64.
+ * @param[in] colsB No. of columns of Input matrix B. Should
+ * be a multiple of 8.
+ * @param[out] output An array representing the result matrix
+ * in row-major format.
+ * Size of the array = `rowsA` * `colsB`.
+ *
+ * This function implements the intrinsic:
+ * int8_multiply_and_add_bias(inputMatrixAPrepared: i32, scaleA: f32,
+ * zeroPointA: f32, inputMatrixBPrepared: i32, scaleB: f32, zeroPointB: f32,
+ * inputBiasPrepared: i32, unquantMultiplier: f32,
+ * rowsA: i32, width: i32, colsB: i32, output: i32)
+ * which implements the function:
+ * int8_multiply_and_add_bias(const int8_t* inputMatrixAPrepared, float
+ * scaleA, float zeroPointA, const int8_t* inputMatrixBPrepared, float scaleB,
+ * float zeroPointB, const float* inputBiasPrepared, float unquantMultiplier,
+ * uint32_t rowsA, uint32_t width, uint32_t colsB, float*
+ * output)
+ */
+int32_t IntrI8MultiplyAndAddBias(wasm::Instance* instance,
+ uint32_t inputMatrixAPrepared, float scaleA,
+ float zeroPointA,
+ uint32_t inputMatrixBPrepared, float scaleB,
+ float zeroPointB, uint32_t inputBiasPrepared,
+ float unquantMultiplier, uint32_t rowsA,
+ uint32_t width, uint32_t colsB,
+ uint32_t output, uint8_t* memBase);
+
+/* Select a subset of columns of prepared B.
+ *
+ * Indices of the columns to be selected are specified by an array.
+ *
+ * @param[in] inputMatrixBPrepared An array representing the prepared B
+ * matrix. This must be obtained by using
+ * one of the `int8_prepare_b*` functions.
+ * Size of the array = `rowsB` * `colsB`.
+ * @param[in] rowsB No. of rows of Input matrix B. It should
+ * be a positive integer and a multiple
+ * of 64.
+ * @param[in] colsB No. of columns of Input matrix B. It
+ * should be a positive integer and a
+ * multiple of 8.
+ * @param[in] colIndexList An array of column indices to be selected
+ * from prepared B. All indices of the array
+ * should be valid
+ * i.e. 0 <= colIndexList[N] < colsB
+ * where N = 0, 1 ....(`sizeColIndexList`-1)
+ * @param[in] sizeColIndexList Size of the `colIndexList` array. It
+ * should be a positive integer and a
+ * multiple of 8.
+ * @param[out] output An array representing the selected columns
+ * of prepared B.
+ * Size = `rowsB` * `sizeColIndexList`.
+ *
+ * This function implements the intrinsic:
+ * int8_select_columns_of_b(inputMatrixBPrepared: i32, rowsB: i32, colsB: i32,
+ * colIndexList: i32, sizeColIndexList: i32, output: i32) which implements the
+ * function: int8_select_columns_of_b(const int8_t* inputMatrixBPrepared,
+ * uint32_t rowsB, uint32_t colsB, const uint32_t* colIndexList, const uint32_t
+ * sizeColIndexList, int8_t* output)
+ */
+int32_t IntrI8SelectColumnsOfB(wasm::Instance* instance,
+ uint32_t inputMatrixBPrepared, uint32_t rowsB,
+ uint32_t colsB, uint32_t colIndexList,
+ uint32_t sizeColIndexList, uint32_t output,
+ uint8_t* memBase);
+
+} // namespace intgemm
+} // namespace js
+
+#endif // intgemm_IntegerGemmIntrinsic_h
diff --git a/js/src/intgemm/README_MOZILLA b/js/src/intgemm/README_MOZILLA
new file mode 100644
index 0000000000..3cf857b166
--- /dev/null
+++ b/js/src/intgemm/README_MOZILLA
@@ -0,0 +1,18 @@
+This directory contains build files for the gemmology reference implementation.
+The actual library source is in $TOPSRCDIR/third_party/gemmology/
+
+Any patches or additional configuration to be applied to the
+upstream source should be kept in $TOPSRCDIR/third_party/gemmology/.
+
+To update the library source and build config files, execute
+
+ ./mach vendor third_party/gemmology/moz.yaml
+
+To update to a specific upstream git tag or commit, use
+
+ ./mach vendor third_party/gemmology/moz.yaml -r <commit>
+
+The upstream git repository is https://github.com/mozilla/gemmology
+
+To view the information about the current version, check the
+'origin' section of moz.yaml.
diff --git a/js/src/intgemm/moz.build b/js/src/intgemm/moz.build
new file mode 100644
index 0000000000..04e8213d3e
--- /dev/null
+++ b/js/src/intgemm/moz.build
@@ -0,0 +1,53 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

include("../js-config.mozbuild")
include("../js-cxxflags.mozbuild")

# The intgemm intrinsics are linked directly into the JS engine library.
FINAL_LIBRARY = "js"

with Files("*"):
    BUG_COMPONENT = ("Core", "JavaScript: WebAssembly")

# gemmology is the vendored GEMM backend; xsimd supplies its SIMD wrappers.
LOCAL_INCLUDES += [
    "!..",
    "..",
    "/third_party/gemmology",
    "/third_party/xsimd/include",
]

# Build one gemmology kernel translation unit per SIMD level, each compiled
# with the matching ISA flags. The USE_* defines advertise which kernels were
# built (consumed by IntegerGemmIntrinsic.cpp's dispatch).
if CONFIG["INTEL_ARCHITECTURE"]:
    # SSE2 is the baseline on Intel; SSSE3/AVX2 are added when the compiler
    # supports the corresponding flags.
    DEFINES["USE_SSE2"] = True
    SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineSSE2.cpp"]
    SOURCES["/third_party/gemmology/kernels/GemmologyEngineSSE2.cpp"].flags += CONFIG[
        "SSE2_FLAGS"
    ]
    if CONFIG["SSSE3_FLAGS"]:
        DEFINES["USE_SSSE3"] = True
        SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp"]
        SOURCES[
            "/third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp"
        ].flags += CONFIG["SSSE3_FLAGS"]
    if CONFIG["AVX2_FLAGS"]:
        DEFINES["USE_AVX2"] = True
        SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineAVX2.cpp"]
        SOURCES[
            "/third_party/gemmology/kernels/GemmologyEngineAVX2.cpp"
        ].flags += CONFIG["AVX2_FLAGS"]

if CONFIG["CPU_ARCH"] == "aarch64":
    DEFINES["USE_NEON"] = True
    SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineNeon64.cpp"]
    SOURCES["/third_party/gemmology/kernels/GemmologyEngineNeon64.cpp"].flags += CONFIG[
        "NEON_FLAGS"
    ]

SOURCES += [
    "IntegerGemmIntrinsic.cpp",
]

# We allow warnings for third-party code that can be updated from upstream.
AllowCompilerWarnings()