Diffstat (limited to 'third_party/intgemm/benchmarks')
-rw-r--r--	third_party/intgemm/benchmarks/benchmark.cc	214
-rw-r--r--	third_party/intgemm/benchmarks/benchmark_quantizer.cc	74
-rw-r--r--	third_party/intgemm/benchmarks/biasmultiply.cc	278
3 files changed, 566 insertions(+), 0 deletions(-)
diff --git a/third_party/intgemm/benchmarks/benchmark.cc b/third_party/intgemm/benchmarks/benchmark.cc
new file mode 100644
index 0000000000..512d3ec39e
--- /dev/null
+++ b/third_party/intgemm/benchmarks/benchmark.cc
@@ -0,0 +1,214 @@
+#include "../intgemm/aligned.h"
+#include "intgemm/intgemm_config.h"
+#include "../intgemm/avx512_gemm.h"
+#include "../intgemm/sse2_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/stats.h"
+#include "../intgemm/callbacks.h"
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <random>
+
+namespace intgemm {
+namespace {
+
+struct RandomMatrices {
+ RandomMatrices(Index A_rows_in, Index width_in, Index B_cols_in) :
+ A_rows(A_rows_in), width(width_in), B_cols(B_cols_in),
+ A(A_rows * width), B(width * B_cols) {
+ std::mt19937 gen;
+ std::uniform_real_distribution<float> dist(-1.f, 1.f);
+ gen.seed(45678);
+
+ for (auto& it : A) {
+ it = dist(gen);
+ }
+ for (auto& it : B) {
+ it = dist(gen);
+ }
+ }
+
+ const Index A_rows, width, B_cols;
+ AlignedVector<float> A, B;
+};
+
+template <class Backend> double Run(const RandomMatrices &m) {
+ using Integer = typename Backend::Integer;
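+  // Scale floats in [-2, 2] onto the quantized integer range; unquant_mult
+  // undoes the scaling applied to both A and B after the integer multiply.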
+ float quant_mult = 127.0f / 2.0f;
+ float unquant_mult = 1.0f / (quant_mult * quant_mult);
+ AlignedVector<Integer> A_prepared(m.A_rows * m.width);
+ Backend::PrepareA(m.A.begin(), A_prepared.begin(), quant_mult, m.A_rows, m.width);
+ AlignedVector<Integer> B_prepared(m.width * m.B_cols);
+ Backend::PrepareB(m.B.begin(), B_prepared.begin(), quant_mult, m.width, m.B_cols);
+ AlignedVector<float> output(m.A_rows * m.B_cols);
+ // Burn in
+ Backend::Multiply(A_prepared.begin(), B_prepared.begin(), m.A_rows, m.width, m.B_cols, callbacks::UnquantizeAndWrite(unquant_mult, output.begin()));
+ auto start = std::chrono::steady_clock::now();
+ Backend::Multiply(A_prepared.begin(), B_prepared.begin(), m.A_rows, m.width, m.B_cols, callbacks::UnquantizeAndWrite(unquant_mult, output.begin()));
+ return std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
+}
+
+template <class Backend> void RunAll(RandomMatrices *matrices, RandomMatrices *matrices_end, std::vector<std::vector<double>> &stats) {
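+  // Skip backends the current CPU cannot execute.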
+ if (Backend::kUses > kCPU) return;
+ std::size_t size = matrices_end - matrices;
+ if (stats.size() < size)
+ stats.resize(size);
+ for (std::size_t i = 0; i < size; ++i) {
+ stats[i].push_back(Run<Backend>(matrices[i]));
+ }
+}
+
+struct BackendStats {
+ std::vector<std::vector<double>> ssse3_8bit;
+ std::vector<std::vector<double>> avx2_8bit;
+ std::vector<std::vector<double>> avx512_8bit;
+ std::vector<std::vector<double>> avx512vnni_8bit;
+ std::vector<std::vector<double>> sse2_16bit;
+ std::vector<std::vector<double>> avx2_16bit;
+ std::vector<std::vector<double>> avx512_16bit;
+};
+
+const float kOutlierThreshold = 0.75;
+void Summarize(std::vector<double> &stats) {
+  // Throw out the slowest 25% of samples as outliers: nth_element partitions
+  // so the fastest kOutlierThreshold fraction precedes `keep`; summarize only those.
+ std::vector<double>::iterator keep = stats.begin() + static_cast<std::size_t>(static_cast<float>(stats.size()) * kOutlierThreshold);
+ std::nth_element(stats.begin(), keep, stats.end());
+ double avg = 0.0;
+ for (std::vector<double>::const_iterator i = stats.begin(); i != keep; ++i) {
+ avg += *i;
+ }
+ avg /= (keep - stats.begin());
+ double stddev = 0.0;
+ for (std::vector<double>::const_iterator i = stats.begin(); i != keep; ++i) {
+ double off = (double)*i - avg;
+ stddev += off * off;
+ }
+ stddev = sqrt(stddev / (keep - stats.begin() - 1));
+ std::cout << std::setw(10) << *std::min_element(stats.begin(), stats.end()) << '\t' << std::setw(8) << avg << '\t' << std::setw(8) << stddev;
+}
+
+template <class Backend> void Print(std::vector<std::vector<double>> &stats, std::size_t index) {
+ if (stats.empty()) return;
+ std::cout << std::setw(16) << Backend::kName << '\t';
+ Summarize(stats[index]);
+ std::cout << '\n';
+}
+
+} // namespace
+} // namespace intgemm
+
+// Program takes no input
+int main(int, char ** argv) {
+ std::cerr << "Remember to run this on a specific core:\ntaskset --cpu-list 0 " << argv[0] << std::endl;
+
+ using namespace intgemm;
+ RandomMatrices matrices[] = {
+ {1, 64, 8},
+ {8, 256, 256},
+ {8, 2048, 256},
+ {8, 256, 2048},
+ {320, 256, 256},
+ {472, 256, 256},
+ {248, 256, 256},
+ {200, 256, 256},
+ // Additional stuff
+ {256, 256, 256},
+ {512, 512, 512},
+ {1024, 1024, 1024},
+/* {4096, 4096, 4096},
+ {4096, 4096, 2048},
+ {4096, 4096, 1024},
+ {4096, 4096, 512},
+ {4096, 4096, 256},*/
+ {4096, 4096, 128}
+ };
+  RandomMatrices *matrices_end = matrices + sizeof(matrices) / sizeof(RandomMatrices);
+ // Only do full sampling for <1024 rows.
+ RandomMatrices *full_sample;
+ for (full_sample = matrices_end - 1; full_sample >= matrices && full_sample->A_rows >= 1024; --full_sample) {}
+ ++full_sample;
+
+ BackendStats stats;
+ const int kSamples = 100;
+ // Realistically, we don't expect different architectures or different precisions to run in the
+ // same run of an application. Benchmark per architecture and per precision level.
+ std::cerr << "SSSE3 8bit, 100 samples..." << std::endl;
+ for (int samples = 0; samples < kSamples; ++samples) {
+ RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
+ RunAll<SSSE3::Kernels8>(matrices, end, stats.ssse3_8bit);
+ }
+
+ std::cerr << "SSE2 16bit, 100 samples..." << std::endl;
+ for (int samples = 0; samples < kSamples; ++samples) {
+ RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
+ RunAll<SSE2::Kernels16>(matrices, end, stats.sse2_16bit);
+ }
+
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
+ std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
+ for (int samples = 0; samples < kSamples; ++samples) {
+ RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
+ RunAll<AVX2::Kernels8>(matrices, end, stats.avx2_8bit);
+ }
+
+ std::cerr << "AVX2 16bit, 100 samples..." << std::endl;
+ for (int samples = 0; samples < kSamples; ++samples) {
+ RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
+ RunAll<AVX2::Kernels16>(matrices, end, stats.avx2_16bit);
+ }
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
+ for (int samples = 0; samples < kSamples; ++samples) {
+ RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
+ RunAll<AVX512BW::Kernels8>(matrices, end, stats.avx512_8bit);
+ }
+
+ std::cerr << "AVX512 16bit, 100 samples..." << std::endl;
+ for (int samples = 0; samples < kSamples; ++samples) {
+ RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
+ RunAll<AVX512BW::Kernels16>(matrices, end, stats.avx512_16bit);
+ }
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+ std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl;
+ for (int samples = 0; samples < kSamples; ++samples) {
+ RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
+ RunAll<AVX512VNNI::Kernels8>(matrices, end, stats.avx512vnni_8bit);
+ }
+#endif
+
+ if (stats.sse2_16bit.empty()) {
+ std::cerr << "No CPU support." << std::endl;
+ return 1;
+ }
+ for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) {
+ std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n';
+ Print<SSSE3::Kernels8>(stats.ssse3_8bit, i);
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
+    Print<AVX2::Kernels8>(stats.avx2_8bit, i);
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ Print<AVX512BW::Kernels8>(stats.avx512_8bit, i);
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+ Print<AVX512VNNI::Kernels8>(stats.avx512vnni_8bit, i);
+#endif
+ Print<SSE2::Kernels16>(stats.sse2_16bit, i);
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
+    Print<AVX2::Kernels16>(stats.avx2_16bit, i);
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ Print<AVX512BW::Kernels16>(stats.avx512_16bit, i);
+#endif
+ }
+ return 0;
+}
+
+
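Each multiply above does 2 * A_rows * width * B_cols integer operations (one
multiply and one add per element of each inner product), so the seconds that
benchmark.cc prints convert directly into throughput. A minimal sketch of that
conversion (GigaOps is a hypothetical helper for illustration, not part of
intgemm):

#include <cstdio>

// Effective throughput, in billions of operations per second, of one
// A_rows x width x B_cols multiply that took `seconds` of wall-clock time.
double GigaOps(double A_rows, double width, double B_cols, double seconds) {
  return 2.0 * A_rows * width * B_cols / seconds / 1e9;
}

int main() {
  // E.g. the {8, 256, 2048} shape from the benchmark, at an assumed 0.0005 s:
  std::printf("%.2f GOPS\n", GigaOps(8, 256, 2048, 0.0005));
  return 0;
}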
diff --git a/third_party/intgemm/benchmarks/benchmark_quantizer.cc b/third_party/intgemm/benchmarks/benchmark_quantizer.cc
new file mode 100644
index 0000000000..5235b1ea0d
--- /dev/null
+++ b/third_party/intgemm/benchmarks/benchmark_quantizer.cc
@@ -0,0 +1,74 @@
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/avx512_gemm.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+namespace {
+
+float MaxAbsoluteBaseline(const float *begin, const float *end) {
+ auto res = std::minmax_element(begin, end);
+ return std::max(std::fabs(*res.first), std::fabs(*res.second));
+}
+
+void BenchmarkMaxAbsolute() {
+ std::mt19937 gen;
+ std::uniform_real_distribution<float> dist(0.f, 1.f);
+ gen.seed(45678);
+
+ intgemm::AlignedVector<float> v(4096 * 4096);
+ for (auto& it : v) {
+ it = dist(gen);
+ }
+
+ // Hopefully these don't get optimized out...
+ MaxAbsoluteBaseline(v.begin(), v.end());
+ auto start = std::chrono::steady_clock::now();
+ MaxAbsoluteBaseline(v.begin(), v.end());
+ double baseline = std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
+ intgemm::MaxAbsolute(v.begin(), v.end());
+ start = std::chrono::steady_clock::now();
+ intgemm::MaxAbsolute(v.begin(), v.end());
+ double optimized = std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
+ std::cout << "MaxAbsolute baseline = " << baseline << " optimized = " << optimized << " speedup = " << (optimized / baseline) << '\n';
+}
+
+template <class Backend> void QuantizerBench(const float *in, int8_t *out, intgemm::Index count) {
+ if (intgemm::kCPU < Backend::kUses) return;
+ Backend::Quantize(in, out, 1.0, count);
+ const std::size_t kTries = 60;
+ auto start = std::chrono::steady_clock::now();
+ for (std::size_t t = 0; t < kTries; ++t) {
+ Backend::Quantize(in, out, 1.0, count);
+ }
+ auto end = std::chrono::steady_clock::now();
+ double took = std::chrono::duration<double>(end - start).count() / kTries;
+ std::cout << std::setw(9) << count << ' ' << std::fixed << std::setw(9) << std::setprecision(7) << took << ' ' << Backend::kName << std::endl;
+}
+} // namespace
+
+int main() {
+ BenchmarkMaxAbsolute();
+ for (std::size_t count = 1; count < (1ULL<<30); count *= 2) {
+ intgemm::AlignedVector<float> in(count);
+ intgemm::AlignedVector<int8_t> out(count);
+ std::mt19937 gen;
+ std::uniform_real_distribution<float> dist(-129.0, 129.0);
+ for (float &element : in) {
+ element = dist(gen);
+ }
+ QuantizerBench<intgemm::SSSE3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
+ QuantizerBench<intgemm::AVX2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ QuantizerBench<intgemm::AVX512BW::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+#endif
+ }
+}
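For context on what QuantizerBench is timing: Quantize scales each float by
quant_mult, rounds, and saturates to the int8 range, which is why the input
distribution deliberately spans (-129, 129). A scalar reference for that
per-element operation (a sketch under those assumptions, not intgemm's
vectorized implementation; the exact rounding and saturation choices live in
the SIMD kernels):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Scalar reference for Quantize: scale, round to nearest, clamp to int8.
void QuantizeScalar(const float *in, int8_t *out, float quant_mult, std::size_t count) {
  for (std::size_t i = 0; i < count; ++i) {
    float scaled = in[i] * quant_mult;                      // scale into integer range
    float rounded = std::nearbyintf(scaled);                // round to nearest
    rounded = std::min(127.f, std::max(-128.f, rounded));   // saturate to int8
    out[i] = static_cast<int8_t>(rounded);
  }
}

int main() {
  float in[4] = {-129.f, -0.4f, 0.6f, 128.5f};
  int8_t out[4];
  QuantizeScalar(in, out, 1.0f, 4);
  for (int8_t v : out) std::printf("%d ", v);  // prints: -128 0 1 127
  std::printf("\n");
  return 0;
}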
diff --git a/third_party/intgemm/benchmarks/biasmultiply.cc b/third_party/intgemm/benchmarks/biasmultiply.cc
new file mode 100644
index 0000000000..c835b61649
--- /dev/null
+++ b/third_party/intgemm/benchmarks/biasmultiply.cc
@@ -0,0 +1,278 @@
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
+#include <chrono>
+#include <random>
+#include <iostream>
+
+using namespace intgemm;
+
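+// Unused two-parameter placeholder; the three-parameter overloads below do the real work.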
+template <class Routine>
+void testOld(Index /*rows*/, Index /*cols*/) {
+}
+
+template <class Routine>
+std::chrono::duration<double> testNew(Index A_rows, Index width, Index B_cols) {
+ AlignedVector<float> A(A_rows * width);
+ AlignedVector<float> B(width * B_cols);
+ AlignedVector<float> bias(B_cols);
+ std::mt19937 gen;
+ std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+ for (auto& it : A) {
+ it = dist(gen);
+ }
+ for (auto& it : B) {
+ it = dist(gen);
+ }
+ for (auto& it : bias) {
+ it = dist(gen);
+ }
+
+ float alpha = 2.0f;
+ float quant_mult = 127.0f / alpha;
+ float unquant_mult = 1.0f / (quant_mult*quant_mult);
+
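+  // The shifted path quantizes A to unsigned bytes (each value offset by +127);
+  // B stays signed int8. Multiply8Shift consumes this uint8 * int8 layout.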
+ AlignedVector<uint8_t> A_prep(A.size());
+ AlignedVector<int8_t> B_prep(B.size());
+ Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
+ Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
+
+ AlignedVector<float> test_C(A_rows * B_cols);
+
+  float unquant_mult_forprep = -alpha * alpha / 127.0f; // Negated so the bias correction is subtracted when add_ps applies it later on.
+ Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
+  auto start = std::chrono::steady_clock::now();
+  Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
+  auto end = std::chrono::steady_clock::now();
+
+  return end - start;
+}
+
+template <class Routine>
+std::chrono::duration<double> testOld(Index A_rows, Index width, Index B_cols) {
+ AlignedVector<float> A(A_rows * width);
+ AlignedVector<float> B(width * B_cols);
+ AlignedVector<float> bias(B_cols);
+ std::mt19937 gen;
+ std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+ for (auto& it : A) {
+ it = dist(gen);
+ }
+ for (auto& it : B) {
+ it = dist(gen);
+ }
+ for (auto& it : bias) {
+ it = dist(gen);
+ }
+
+ float alpha = 2.0f;
+ float quant_mult = 127.0f / alpha;
+ float unquant_mult = 1.0f / (quant_mult*quant_mult);
+
+ AlignedVector<int8_t> A_prep(A.size());
+ AlignedVector<int8_t> B_prep(B.size());
+ Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
+ Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
+
+ AlignedVector<float> test_C(A_rows * B_cols);
+
+  auto start = std::chrono::steady_clock::now();
+  Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
+  auto end = std::chrono::steady_clock::now();
+
+  return end - start;
+}
+
+template <class Routine>
+std::chrono::duration<double> testOld_nobias(Index A_rows, Index width, Index B_cols) {
+ AlignedVector<float> A(A_rows * width);
+ AlignedVector<float> B(width * B_cols);
+ std::mt19937 gen;
+ std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+ for (auto& it : A) {
+ it = dist(gen);
+ }
+ for (auto& it : B) {
+ it = dist(gen);
+ }
+
+ float alpha = 2.0f;
+ float quant_mult = 127.0f / alpha;
+ float unquant_mult = 1.0f / (quant_mult*quant_mult);
+
+ AlignedVector<int8_t> A_prep(A.size());
+ AlignedVector<int8_t> B_prep(B.size());
+ Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
+ Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
+
+ AlignedVector<float> test_C(A_rows * B_cols);
+
+  auto start = std::chrono::steady_clock::now();
+  Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWrite(unquant_mult, test_C.begin()));
+  auto end = std::chrono::steady_clock::now();
+
+  return end - start;
+}
+
+int main(int argc, char ** argv) {
+ int repeat = 1000;
+ if (argc > 1) {
+ repeat = atoi(argv[1]);
+ }
+
+ std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<SSSE3::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 2048, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(320, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(472, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(248, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." << std::endl;
+
+ std::chrono::duration<double> oldSSSE3 = testOld<SSSE3::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldSSSE3 += testOld<SSSE3::Kernels8>(8, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(8, 2048, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(320, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(472, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(248, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl;
+
+  std::chrono::duration<double> newTimeSSSE3 = testNew<SSSE3::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 2048, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(320, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(472, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(248, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
+
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
+ std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<AVX2::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 2048, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(320, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(472, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(248, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl;
+
+ std::chrono::duration<double> oldAVX2 = testOld<AVX2::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldAVX2 += testOld<AVX2::Kernels8>(8, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(8, 2048, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(320, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(472, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(248, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." << std::endl;
+
+  std::chrono::duration<double> newTimeAVX2 = testNew<AVX2::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ newTimeAVX2 += testNew<AVX2::Kernels8>(8, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(8, 2048, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(320, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(472, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(248, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ if (kCPU < CPUType::AVX512BW) return 0;
+ std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512BW::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 2048, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(320, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(472, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(248, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl;
+
+ std::chrono::duration<double> oldAVX512 = testOld<AVX512BW::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldAVX512 += testOld<AVX512BW::Kernels8>(8, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(8, 2048, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(320, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(472, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(248, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl;
+
+  std::chrono::duration<double> newTimeAVX512 = testNew<AVX512BW::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 2048, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(320, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(472, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(248, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." << std::endl;
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+ if (kCPU < CPUType::AVX512VNNI) return 0;
+  std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<AVX512VNNI::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 2048, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(320, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(472, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(248, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl;
+
+  std::chrono::duration<double> oldAVX512VNNI = testOld<AVX512VNNI::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 2048, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(320, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(472, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(248, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl;
+
+  std::chrono::duration<double> newTimeAVX512VNNI = testNew<AVX512VNNI::Kernels8>(1, 64, 8);
+ for (int i = 0; i<repeat; i++) {
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 2048, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(320, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(472, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(248, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(200, 256, 256);
+ }
+
+ std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." << std::endl;
+#endif
+
+}
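Why the shifted ("new") path needs PrepareBias: testNew quantizes A to unsigned
bytes, i.e. every entry is offset by +127 so Multiply8Shift can use
unsigned-by-signed instructions. Since (a + 127) * b = a*b + 127*b, each output
accumulates a spurious 127 * sum_k B[k][j] term; PrepareBias precomputes that
per-column constant and folds it into the bias, with the negative
unquant_mult_forprep supplying the sign and scale. A scalar demonstration of
the identity (a sketch; the array values are illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  const int width = 4;
  int8_t a[width] = {-3, 7, -1, 5};
  int8_t b[width] = {2, -6, 4, 1};

  int32_t signed_dot = 0, shifted_dot = 0, column_sum = 0;
  for (int k = 0; k < width; ++k) {
    signed_dot += a[k] * b[k];
    shifted_dot += static_cast<uint8_t>(a[k] + 127) * b[k];  // what the shifted kernel accumulates
    column_sum += b[k];
  }
  // Subtracting the 127 * column_sum correction recovers the signed result.
  std::printf("%d == %d\n", signed_dot, shifted_dot - 127 * column_sum);
  return 0;
}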