path: root/lib/crypto_backend/argon2/blake2
author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 17:44:12 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 17:44:12 +0000
commit    1be69c2c660b70ac2f4de2a5326e27e3e60eb82d (patch)
tree      bb299ab6f411f4fccd735907035de710e4ec6abc /lib/crypto_backend/argon2/blake2
parent    Initial commit. (diff)
Adding upstream version 2:2.3.7. (refs: upstream/2%2.3.7, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/crypto_backend/argon2/blake2')
-rw-r--r--  lib/crypto_backend/argon2/blake2/blake2-impl.h      | 154
-rw-r--r--  lib/crypto_backend/argon2/blake2/blake2.h           |  89
-rw-r--r--  lib/crypto_backend/argon2/blake2/blake2b.c          | 392
-rw-r--r--  lib/crypto_backend/argon2/blake2/blamka-round-opt.h | 471
-rw-r--r--  lib/crypto_backend/argon2/blake2/blamka-round-ref.h |  56
5 files changed, 1162 insertions, 0 deletions
diff --git a/lib/crypto_backend/argon2/blake2/blake2-impl.h b/lib/crypto_backend/argon2/blake2/blake2-impl.h
new file mode 100644
index 0000000..dcac827
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blake2-impl.h
@@ -0,0 +1,154 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef PORTABLE_BLAKE2_IMPL_H
+#define PORTABLE_BLAKE2_IMPL_H
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(_MSC_VER)
+#define BLAKE2_INLINE __inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define BLAKE2_INLINE __inline__
+#else
+#define BLAKE2_INLINE
+#endif
+
+/* Argon2 Team - Begin Code */
+/*
+ Not an exhaustive list, but should cover the majority of modern platforms
+ Additionally, the code will always be correct---this is only a performance
+ tweak.
+*/
+#if (defined(__BYTE_ORDER__) && \
+ (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+ defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
+ defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \
+ defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \
+ defined(_M_ARM)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+/* Argon2 Team - End Code */
+
+static BLAKE2_INLINE uint32_t load32(const void *src) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+ uint32_t w;
+ memcpy(&w, src, sizeof w);
+ return w;
+#else
+ const uint8_t *p = (const uint8_t *)src;
+ uint32_t w = *p++;
+ w |= (uint32_t)(*p++) << 8;
+ w |= (uint32_t)(*p++) << 16;
+ w |= (uint32_t)(*p++) << 24;
+ return w;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load64(const void *src) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+ uint64_t w;
+ memcpy(&w, src, sizeof w);
+ return w;
+#else
+ const uint8_t *p = (const uint8_t *)src;
+ uint64_t w = *p++;
+ w |= (uint64_t)(*p++) << 8;
+ w |= (uint64_t)(*p++) << 16;
+ w |= (uint64_t)(*p++) << 24;
+ w |= (uint64_t)(*p++) << 32;
+ w |= (uint64_t)(*p++) << 40;
+ w |= (uint64_t)(*p++) << 48;
+ w |= (uint64_t)(*p++) << 56;
+ return w;
+#endif
+}
+
+static BLAKE2_INLINE void store32(void *dst, uint32_t w) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+ memcpy(dst, &w, sizeof w);
+#else
+ uint8_t *p = (uint8_t *)dst;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+#endif
+}
+
+static BLAKE2_INLINE void store64(void *dst, uint64_t w) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+ memcpy(dst, &w, sizeof w);
+#else
+ uint8_t *p = (uint8_t *)dst;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load48(const void *src) {
+ const uint8_t *p = (const uint8_t *)src;
+ uint64_t w = *p++;
+ w |= (uint64_t)(*p++) << 8;
+ w |= (uint64_t)(*p++) << 16;
+ w |= (uint64_t)(*p++) << 24;
+ w |= (uint64_t)(*p++) << 32;
+ w |= (uint64_t)(*p++) << 40;
+ return w;
+}
+
+static BLAKE2_INLINE void store48(void *dst, uint64_t w) {
+ uint8_t *p = (uint8_t *)dst;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+ w >>= 8;
+ *p++ = (uint8_t)w;
+}
+
+static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) {
+ return (w >> c) | (w << (32 - c));
+}
+
+static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
+ return (w >> c) | (w << (64 - c));
+}
+
+#endif
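
A minimal stand-alone check (hypothetical, not part of this commit) illustrating the contract of the helpers above: BLAKE2b serializes words little-endian on every platform, the NATIVE_LITTLE_ENDIAN branch is only a fast path, and rotr64 is a plain right rotation.

    /* Hypothetical self-check; compile next to blake2-impl.h. */
    #include <assert.h>
    #include <stdint.h>
    #include "blake2-impl.h"

    int main(void) {
        uint8_t buf[8];
        store64(buf, UINT64_C(0x0123456789abcdef));
        assert(buf[0] == 0xef && buf[7] == 0x01);    /* little-endian byte order */
        assert(load64(buf) == UINT64_C(0x0123456789abcdef));
        assert(rotr64(1, 1) == UINT64_C(1) << 63);   /* low bit wraps to the top */
        return 0;
    }
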
diff --git a/lib/crypto_backend/argon2/blake2/blake2.h b/lib/crypto_backend/argon2/blake2/blake2.h
new file mode 100644
index 0000000..0c1b0ee
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blake2.h
@@ -0,0 +1,89 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef PORTABLE_BLAKE2_H
+#define PORTABLE_BLAKE2_H
+
+#include "../argon2.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+enum blake2b_constant {
+ BLAKE2B_BLOCKBYTES = 128,
+ BLAKE2B_OUTBYTES = 64,
+ BLAKE2B_KEYBYTES = 64,
+ BLAKE2B_SALTBYTES = 16,
+ BLAKE2B_PERSONALBYTES = 16
+};
+
+#pragma pack(push, 1)
+typedef struct __blake2b_param {
+ uint8_t digest_length; /* 1 */
+ uint8_t key_length; /* 2 */
+ uint8_t fanout; /* 3 */
+ uint8_t depth; /* 4 */
+ uint32_t leaf_length; /* 8 */
+ uint64_t node_offset; /* 16 */
+ uint8_t node_depth; /* 17 */
+ uint8_t inner_length; /* 18 */
+ uint8_t reserved[14]; /* 32 */
+ uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
+ uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
+} blake2b_param;
+#pragma pack(pop)
+
+typedef struct __blake2b_state {
+ uint64_t h[8];
+ uint64_t t[2];
+ uint64_t f[2];
+ uint8_t buf[BLAKE2B_BLOCKBYTES];
+ unsigned buflen;
+ unsigned outlen;
+ uint8_t last_node;
+} blake2b_state;
+
+/* Ensure param structs have not been wrongly padded */
+/* Poor man's static_assert */
+enum {
+ blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
+ blake2_size_check_2 =
+ 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT)
+};
+
+/* Streaming API */
+ARGON2_LOCAL int blake2b_init(blake2b_state *S, size_t outlen);
+ARGON2_LOCAL int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+ size_t keylen);
+ARGON2_LOCAL int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
+ARGON2_LOCAL int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
+ARGON2_LOCAL int blake2b_final(blake2b_state *S, void *out, size_t outlen);
+
+/* Simple API */
+ARGON2_LOCAL int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
+ const void *key, size_t keylen);
+
+/* Argon2 Team - Begin Code */
+ARGON2_LOCAL int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
+/* Argon2 Team - End Code */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
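
A usage sketch of the streaming API declared above. Illustrative only: ARGON2_LOCAL marks these functions internal to the Argon2 backend, so code outside the library would go through the crypto_backend wrappers instead, and hash_two_parts is a hypothetical helper name.

    #include <stddef.h>
    #include <stdint.h>
    #include "blake2.h"

    /* Hash two buffers as one message into a 32-byte digest. */
    static int hash_two_parts(uint8_t out[32],
                              const void *p1, size_t n1,
                              const void *p2, size_t n2) {
        blake2b_state S;
        if (blake2b_init(&S, 32) < 0)        /* unkeyed, 32-byte output */
            return -1;
        if (blake2b_update(&S, p1, n1) < 0)  /* absorb first chunk */
            return -1;
        if (blake2b_update(&S, p2, n2) < 0)  /* absorb second chunk */
            return -1;
        return blake2b_final(&S, out, 32);   /* emit digest, wipe state */
    }
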
diff --git a/lib/crypto_backend/argon2/blake2/blake2b.c b/lib/crypto_backend/argon2/blake2/blake2b.c
new file mode 100644
index 0000000..d8f69e8
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blake2b.c
@@ -0,0 +1,392 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+void clear_internal_memory(void *v, size_t n);
+
+static const uint64_t blake2b_IV[8] = {
+ UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
+ UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
+ UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
+ UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)};
+
+static const unsigned int blake2b_sigma[12][16] = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+ {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+ {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+ {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+ {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+ {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+ {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+ {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+ {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+};
+
+static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) {
+ S->f[1] = (uint64_t)-1;
+}
+
+static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) {
+ if (S->last_node) {
+ blake2b_set_lastnode(S);
+ }
+ S->f[0] = (uint64_t)-1;
+}
+
+static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S,
+ uint64_t inc) {
+ S->t[0] += inc;
+ S->t[1] += (S->t[0] < inc);
+}
+
+static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) {
+ clear_internal_memory(S, sizeof(*S)); /* wipe */
+ blake2b_set_lastblock(S); /* invalidate for further use */
+}
+
+static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) {
+ memset(S, 0, sizeof(*S));
+ memcpy(S->h, blake2b_IV, sizeof(S->h));
+}
+
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P) {
+ const unsigned char *p = (const unsigned char *)P;
+ unsigned int i;
+
+ if (NULL == P || NULL == S) {
+ return -1;
+ }
+
+ blake2b_init0(S);
+ /* IV XOR Parameter Block */
+ for (i = 0; i < 8; ++i) {
+ S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
+ }
+ S->outlen = P->digest_length;
+ return 0;
+}
+
+/* Sequential blake2b initialization */
+int blake2b_init(blake2b_state *S, size_t outlen) {
+ blake2b_param P;
+
+ if (S == NULL) {
+ return -1;
+ }
+
+ if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
+ blake2b_invalidate_state(S);
+ return -1;
+ }
+
+ /* Setup Parameter Block for unkeyed BLAKE2 */
+ P.digest_length = (uint8_t)outlen;
+ P.key_length = 0;
+ P.fanout = 1;
+ P.depth = 1;
+ P.leaf_length = 0;
+ P.node_offset = 0;
+ P.node_depth = 0;
+ P.inner_length = 0;
+ memset(P.reserved, 0, sizeof(P.reserved));
+ memset(P.salt, 0, sizeof(P.salt));
+ memset(P.personal, 0, sizeof(P.personal));
+
+ return blake2b_init_param(S, &P);
+}
+
+int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+ size_t keylen) {
+ blake2b_param P;
+
+ if (S == NULL) {
+ return -1;
+ }
+
+ if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
+ blake2b_invalidate_state(S);
+ return -1;
+ }
+
+ if ((key == 0) || (keylen == 0) || (keylen > BLAKE2B_KEYBYTES)) {
+ blake2b_invalidate_state(S);
+ return -1;
+ }
+
+ /* Setup Parameter Block for keyed BLAKE2 */
+ P.digest_length = (uint8_t)outlen;
+ P.key_length = (uint8_t)keylen;
+ P.fanout = 1;
+ P.depth = 1;
+ P.leaf_length = 0;
+ P.node_offset = 0;
+ P.node_depth = 0;
+ P.inner_length = 0;
+ memset(P.reserved, 0, sizeof(P.reserved));
+ memset(P.salt, 0, sizeof(P.salt));
+ memset(P.personal, 0, sizeof(P.personal));
+
+ if (blake2b_init_param(S, &P) < 0) {
+ blake2b_invalidate_state(S);
+ return -1;
+ }
+
+ {
+ uint8_t block[BLAKE2B_BLOCKBYTES];
+ memset(block, 0, BLAKE2B_BLOCKBYTES);
+ memcpy(block, key, keylen);
+ blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
+ /* Burn the key from stack */
+ clear_internal_memory(block, BLAKE2B_BLOCKBYTES);
+ }
+ return 0;
+}
+
+static void blake2b_compress(blake2b_state *S, const uint8_t *block) {
+ uint64_t m[16];
+ uint64_t v[16];
+ unsigned int i, r;
+
+ for (i = 0; i < 16; ++i) {
+ m[i] = load64(block + i * sizeof(m[i]));
+ }
+
+ for (i = 0; i < 8; ++i) {
+ v[i] = S->h[i];
+ }
+
+ v[8] = blake2b_IV[0];
+ v[9] = blake2b_IV[1];
+ v[10] = blake2b_IV[2];
+ v[11] = blake2b_IV[3];
+ v[12] = blake2b_IV[4] ^ S->t[0];
+ v[13] = blake2b_IV[5] ^ S->t[1];
+ v[14] = blake2b_IV[6] ^ S->f[0];
+ v[15] = blake2b_IV[7] ^ S->f[1];
+
+#define G(r, i, a, b, c, d) \
+ do { \
+ a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
+ d = rotr64(d ^ a, 32); \
+ c = c + d; \
+ b = rotr64(b ^ c, 24); \
+ a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
+ d = rotr64(d ^ a, 16); \
+ c = c + d; \
+ b = rotr64(b ^ c, 63); \
+ } while ((void)0, 0)
+
+#define ROUND(r) \
+ do { \
+ G(r, 0, v[0], v[4], v[8], v[12]); \
+ G(r, 1, v[1], v[5], v[9], v[13]); \
+ G(r, 2, v[2], v[6], v[10], v[14]); \
+ G(r, 3, v[3], v[7], v[11], v[15]); \
+ G(r, 4, v[0], v[5], v[10], v[15]); \
+ G(r, 5, v[1], v[6], v[11], v[12]); \
+ G(r, 6, v[2], v[7], v[8], v[13]); \
+ G(r, 7, v[3], v[4], v[9], v[14]); \
+ } while ((void)0, 0)
+
+ for (r = 0; r < 12; ++r) {
+ ROUND(r);
+ }
+
+ for (i = 0; i < 8; ++i) {
+ S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+ }
+
+#undef G
+#undef ROUND
+}
+
+int blake2b_update(blake2b_state *S, const void *in, size_t inlen) {
+ const uint8_t *pin = (const uint8_t *)in;
+
+ if (inlen == 0) {
+ return 0;
+ }
+
+ /* Sanity check */
+ if (S == NULL || in == NULL) {
+ return -1;
+ }
+
+ /* Is this a reused state? */
+ if (S->f[0] != 0) {
+ return -1;
+ }
+
+ if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) {
+ /* Complete current block */
+ size_t left = S->buflen;
+ size_t fill = BLAKE2B_BLOCKBYTES - left;
+ memcpy(&S->buf[left], pin, fill);
+ blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+ blake2b_compress(S, S->buf);
+ S->buflen = 0;
+ inlen -= fill;
+ pin += fill;
+ /* Avoid buffer copies when possible */
+ while (inlen > BLAKE2B_BLOCKBYTES) {
+ blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+ blake2b_compress(S, pin);
+ inlen -= BLAKE2B_BLOCKBYTES;
+ pin += BLAKE2B_BLOCKBYTES;
+ }
+ }
+ memcpy(&S->buf[S->buflen], pin, inlen);
+ S->buflen += (unsigned int)inlen;
+ return 0;
+}
+
+int blake2b_final(blake2b_state *S, void *out, size_t outlen) {
+ uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
+ unsigned int i;
+
+ /* Sanity checks */
+ if (S == NULL || out == NULL || outlen < S->outlen) {
+ return -1;
+ }
+
+ /* Is this a reused state? */
+ if (S->f[0] != 0) {
+ return -1;
+ }
+
+ blake2b_increment_counter(S, S->buflen);
+ blake2b_set_lastblock(S);
+ memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
+ blake2b_compress(S, S->buf);
+
+ for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
+ store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
+ }
+
+ memcpy(out, buffer, S->outlen);
+ clear_internal_memory(buffer, sizeof(buffer));
+ clear_internal_memory(S->buf, sizeof(S->buf));
+ clear_internal_memory(S->h, sizeof(S->h));
+ return 0;
+}
+
+int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
+ const void *key, size_t keylen) {
+ blake2b_state S;
+ int ret = -1;
+
+ /* Verify parameters */
+ if (NULL == in && inlen > 0) {
+ goto fail;
+ }
+
+ if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
+ goto fail;
+ }
+
+ if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
+ goto fail;
+ }
+
+ if (keylen > 0) {
+ if (blake2b_init_key(&S, outlen, key, keylen) < 0) {
+ goto fail;
+ }
+ } else {
+ if (blake2b_init(&S, outlen) < 0) {
+ goto fail;
+ }
+ }
+
+ if (blake2b_update(&S, in, inlen) < 0) {
+ goto fail;
+ }
+ ret = blake2b_final(&S, out, outlen);
+
+fail:
+ clear_internal_memory(&S, sizeof(S));
+ return ret;
+}
+
+/* Argon2 Team - Begin Code */
+int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
+ uint8_t *out = (uint8_t *)pout;
+ blake2b_state blake_state;
+ uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
+ int ret = -1;
+
+ if (outlen > UINT32_MAX) {
+ goto fail;
+ }
+
+ /* Ensure little-endian byte order! */
+ store32(outlen_bytes, (uint32_t)outlen);
+
+#define TRY(statement) \
+ do { \
+ ret = statement; \
+ if (ret < 0) { \
+ goto fail; \
+ } \
+ } while ((void)0, 0)
+
+ if (outlen <= BLAKE2B_OUTBYTES) {
+ TRY(blake2b_init(&blake_state, outlen));
+ TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+ TRY(blake2b_update(&blake_state, in, inlen));
+ TRY(blake2b_final(&blake_state, out, outlen));
+ } else {
+ uint32_t toproduce;
+ uint8_t out_buffer[BLAKE2B_OUTBYTES];
+ uint8_t in_buffer[BLAKE2B_OUTBYTES];
+ TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
+ TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+ TRY(blake2b_update(&blake_state, in, inlen));
+ TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
+ memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+ out += BLAKE2B_OUTBYTES / 2;
+ toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
+
+ while (toproduce > BLAKE2B_OUTBYTES) {
+ memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+ TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
+ BLAKE2B_OUTBYTES, NULL, 0));
+ memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+ out += BLAKE2B_OUTBYTES / 2;
+ toproduce -= BLAKE2B_OUTBYTES / 2;
+ }
+
+ memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+ TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
+ 0));
+ memcpy(out, out_buffer, toproduce);
+ }
+fail:
+ clear_internal_memory(&blake_state, sizeof(blake_state));
+ return ret;
+#undef TRY
+}
+/* Argon2 Team - End Code */
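
The chained construction in blake2b_long (the H' function of the Argon2 spec) can be verified by hand. A hypothetical self-check for outlen = 72, assuming only the functions defined above: each intermediate 64-byte digest contributes its first half to the output and is then rehashed whole, so every output byte depends on the full previous digest.

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>
    #include "blake2.h"

    /* Recompute blake2b_long(out, 72, in, inlen) step by step:
     *   V1  = BLAKE2b-64( LE32(72) || in )
     *   V2  = BLAKE2b-40( V1 )
     *   out = V1[0..31] || V2               (32 + 40 = 72 bytes) */
    static void check_blake2b_long_72(const uint8_t *in, size_t inlen) {
        uint8_t le[4] = {72, 0, 0, 0}, v1[64], v2[40], want[72], got[72];
        blake2b_state S;
        blake2b_init(&S, 64);
        blake2b_update(&S, le, sizeof le);   /* little-endian length prefix */
        blake2b_update(&S, in, inlen);
        blake2b_final(&S, v1, 64);           /* V1 */
        blake2b(v2, 40, v1, 64, NULL, 0);    /* final digest of previous V */
        memcpy(want, v1, 32);                /* only half of V1 is emitted */
        memcpy(want + 32, v2, 40);           /* the last V is emitted whole */
        blake2b_long(got, 72, in, inlen);
        assert(memcmp(got, want, sizeof got) == 0);
    }
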
diff --git a/lib/crypto_backend/argon2/blake2/blamka-round-opt.h b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
new file mode 100644
index 0000000..3127f2a
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
@@ -0,0 +1,471 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
+#endif
+
+#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
+#include <x86intrin.h>
+#endif
+
+#if !defined(__AVX512F__)
+#if !defined(__AVX2__)
+#if !defined(__XOP__)
+#if defined(__SSSE3__)
+#define r16 \
+ (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24 \
+ (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c) \
+ (-(c) == 32) \
+ ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
+ : (-(c) == 24) \
+ ? _mm_shuffle_epi8((x), r24) \
+ : (-(c) == 16) \
+ ? _mm_shuffle_epi8((x), r16) \
+ : (-(c) == 63) \
+ ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+ _mm_add_epi64((x), (x))) \
+ : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+ _mm_slli_epi64((x), 64 - (-(c))))
+#else /* defined(__SSE2__) */
+#define _mm_roti_epi64(r, c) \
+ _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
+#endif
+#else
+#endif
+
+static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
+ const __m128i z = _mm_mul_epu32(x, y);
+ return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ A0 = fBlaMka(A0, B0); \
+ A1 = fBlaMka(A1, B1); \
+ \
+ D0 = _mm_xor_si128(D0, A0); \
+ D1 = _mm_xor_si128(D1, A1); \
+ \
+ D0 = _mm_roti_epi64(D0, -32); \
+ D1 = _mm_roti_epi64(D1, -32); \
+ \
+ C0 = fBlaMka(C0, D0); \
+ C1 = fBlaMka(C1, D1); \
+ \
+ B0 = _mm_xor_si128(B0, C0); \
+ B1 = _mm_xor_si128(B1, C1); \
+ \
+ B0 = _mm_roti_epi64(B0, -24); \
+ B1 = _mm_roti_epi64(B1, -24); \
+ } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ A0 = fBlaMka(A0, B0); \
+ A1 = fBlaMka(A1, B1); \
+ \
+ D0 = _mm_xor_si128(D0, A0); \
+ D1 = _mm_xor_si128(D1, A1); \
+ \
+ D0 = _mm_roti_epi64(D0, -16); \
+ D1 = _mm_roti_epi64(D1, -16); \
+ \
+ C0 = fBlaMka(C0, D0); \
+ C1 = fBlaMka(C1, D1); \
+ \
+ B0 = _mm_xor_si128(B0, C0); \
+ B1 = _mm_xor_si128(B1, C1); \
+ \
+ B0 = _mm_roti_epi64(B0, -63); \
+ B1 = _mm_roti_epi64(B1, -63); \
+ } while ((void)0, 0)
+
+#if defined(__SSSE3__)
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
+ __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
+ B0 = t0; \
+ B1 = t1; \
+ \
+ t0 = C0; \
+ C0 = C1; \
+ C1 = t0; \
+ \
+ t0 = _mm_alignr_epi8(D1, D0, 8); \
+ t1 = _mm_alignr_epi8(D0, D1, 8); \
+ D0 = t1; \
+ D1 = t0; \
+ } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
+ __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
+ B0 = t0; \
+ B1 = t1; \
+ \
+ t0 = C0; \
+ C0 = C1; \
+ C1 = t0; \
+ \
+ t0 = _mm_alignr_epi8(D0, D1, 8); \
+ t1 = _mm_alignr_epi8(D1, D0, 8); \
+ D0 = t1; \
+ D1 = t0; \
+ } while ((void)0, 0)
+#else /* SSE2 */
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ __m128i t0 = D0; \
+ __m128i t1 = B0; \
+ D0 = C0; \
+ C0 = C1; \
+ C1 = D0; \
+ D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
+ D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
+ B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
+ B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
+ } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ __m128i t0, t1; \
+ t0 = C0; \
+ C0 = C1; \
+ C1 = t0; \
+ t0 = B0; \
+ t1 = D0; \
+ B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
+ B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
+ D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
+ D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
+ } while ((void)0, 0)
+#endif
+
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do { \
+ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+ DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+ \
+ UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+ } while ((void)0, 0)
+#else /* __AVX2__ */
+
+#include <immintrin.h>
+
+#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do { \
+ __m256i ml = _mm256_mul_epu32(A0, B0); \
+ ml = _mm256_add_epi64(ml, ml); \
+ A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+ D0 = _mm256_xor_si256(D0, A0); \
+ D0 = rotr32(D0); \
+ \
+ ml = _mm256_mul_epu32(C0, D0); \
+ ml = _mm256_add_epi64(ml, ml); \
+ C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+ \
+ B0 = _mm256_xor_si256(B0, C0); \
+ B0 = rotr24(B0); \
+ \
+ ml = _mm256_mul_epu32(A1, B1); \
+ ml = _mm256_add_epi64(ml, ml); \
+ A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+ D1 = _mm256_xor_si256(D1, A1); \
+ D1 = rotr32(D1); \
+ \
+ ml = _mm256_mul_epu32(C1, D1); \
+ ml = _mm256_add_epi64(ml, ml); \
+ C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+ \
+ B1 = _mm256_xor_si256(B1, C1); \
+ B1 = rotr24(B1); \
+ } while((void)0, 0);
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do { \
+ __m256i ml = _mm256_mul_epu32(A0, B0); \
+ ml = _mm256_add_epi64(ml, ml); \
+ A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+ D0 = _mm256_xor_si256(D0, A0); \
+ D0 = rotr16(D0); \
+ \
+ ml = _mm256_mul_epu32(C0, D0); \
+ ml = _mm256_add_epi64(ml, ml); \
+ C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+ B0 = _mm256_xor_si256(B0, C0); \
+ B0 = rotr63(B0); \
+ \
+ ml = _mm256_mul_epu32(A1, B1); \
+ ml = _mm256_add_epi64(ml, ml); \
+ A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+ D1 = _mm256_xor_si256(D1, A1); \
+ D1 = rotr16(D1); \
+ \
+ ml = _mm256_mul_epu32(C1, D1); \
+ ml = _mm256_add_epi64(ml, ml); \
+ C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+ B1 = _mm256_xor_si256(B1, C1); \
+ B1 = rotr63(B1); \
+ } while((void)0, 0);
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+ \
+ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+ } while((void)0, 0);
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do { \
+ __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+ B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+ B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ \
+ tmp1 = C0; \
+ C0 = C1; \
+ C1 = tmp1; \
+ \
+ tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+ tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+ D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+ D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ } while(0);
+
+#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+ \
+ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+ } while((void)0, 0);
+
+#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do { \
+ __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+ B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+ B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ \
+ tmp1 = C0; \
+ C0 = C1; \
+ C1 = tmp1; \
+ \
+ tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
+ tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
+ D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+ D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+ } while((void)0, 0);
+
+#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do{ \
+ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+ DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ \
+ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+ UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ } while((void)0, 0);
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do{ \
+ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+ DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ \
+ UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ } while((void)0, 0);
+
+#endif /* __AVX2__ */
+
+#else /* __AVX512F__ */
+
+#include <immintrin.h>
+
+#define ror64(x, n) _mm512_ror_epi64((x), (n))
+
+static __m512i muladd(__m512i x, __m512i y)
+{
+ __m512i z = _mm512_mul_epu32(x, y);
+ return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ A0 = muladd(A0, B0); \
+ A1 = muladd(A1, B1); \
+\
+ D0 = _mm512_xor_si512(D0, A0); \
+ D1 = _mm512_xor_si512(D1, A1); \
+\
+ D0 = ror64(D0, 32); \
+ D1 = ror64(D1, 32); \
+\
+ C0 = muladd(C0, D0); \
+ C1 = muladd(C1, D1); \
+\
+ B0 = _mm512_xor_si512(B0, C0); \
+ B1 = _mm512_xor_si512(B1, C1); \
+\
+ B0 = ror64(B0, 24); \
+ B1 = ror64(B1, 24); \
+ } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ A0 = muladd(A0, B0); \
+ A1 = muladd(A1, B1); \
+\
+ D0 = _mm512_xor_si512(D0, A0); \
+ D1 = _mm512_xor_si512(D1, A1); \
+\
+ D0 = ror64(D0, 16); \
+ D1 = ror64(D1, 16); \
+\
+ C0 = muladd(C0, D0); \
+ C1 = muladd(C1, D1); \
+\
+ B0 = _mm512_xor_si512(B0, C0); \
+ B1 = _mm512_xor_si512(B1, C1); \
+\
+ B0 = ror64(B0, 63); \
+ B1 = ror64(B1, 63); \
+ } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+\
+ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+ } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+\
+ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+ } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
+ do { \
+ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+ DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+ UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+ } while ((void)0, 0)
+
+#define SWAP_HALVES(A0, A1) \
+ do { \
+ __m512i t0, t1; \
+ t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+ t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+ A0 = t0; \
+ A1 = t1; \
+ } while((void)0, 0)
+
+#define SWAP_QUARTERS(A0, A1) \
+ do { \
+ SWAP_HALVES(A0, A1); \
+ A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+ A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+ } while((void)0, 0)
+
+#define UNSWAP_QUARTERS(A0, A1) \
+ do { \
+ A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+ A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+ SWAP_HALVES(A0, A1); \
+ } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
+ do { \
+ SWAP_HALVES(A0, B0); \
+ SWAP_HALVES(C0, D0); \
+ SWAP_HALVES(A1, B1); \
+ SWAP_HALVES(C1, D1); \
+ BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+ SWAP_HALVES(A0, B0); \
+ SWAP_HALVES(C0, D0); \
+ SWAP_HALVES(A1, B1); \
+ SWAP_HALVES(C1, D1); \
+ } while ((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+ do { \
+ SWAP_QUARTERS(A0, A1); \
+ SWAP_QUARTERS(B0, B1); \
+ SWAP_QUARTERS(C0, C1); \
+ SWAP_QUARTERS(D0, D1); \
+ BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+ UNSWAP_QUARTERS(A0, A1); \
+ UNSWAP_QUARTERS(B0, B1); \
+ UNSWAP_QUARTERS(C0, C1); \
+ UNSWAP_QUARTERS(D0, D1); \
+ } while ((void)0, 0)
+
+#endif /* __AVX512F__ */
+#endif /* BLAKE_ROUND_MKA_OPT_H */
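
Per 64-bit lane, every vector variant above computes the same BlaMka primitive. A scalar model of one lane (mirroring fBlaMka in blamka-round-ref.h below), showing what _mm_mul_epu32 and its AVX2/AVX-512 counterparts contribute:

    #include <stdint.h>

    /* One lane of the vector fBlaMka: the mul_epu32 intrinsics multiply
     * only the low 32 bits of each 64-bit lane, and z + z supplies the
     * factor of 2 without a shift. */
    static uint64_t fblamka_lane(uint64_t x, uint64_t y) {
        uint64_t z = (x & UINT64_C(0xFFFFFFFF)) * (y & UINT64_C(0xFFFFFFFF));
        return (x + y) + (z + z);   /* x + y + 2 * lo32(x) * lo32(y) */
    }
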
diff --git a/lib/crypto_backend/argon2/blake2/blamka-round-ref.h b/lib/crypto_backend/argon2/blake2/blamka-round-ref.h
new file mode 100644
index 0000000..16cfc1c
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blamka-round-ref.h
@@ -0,0 +1,56 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_H
+#define BLAKE_ROUND_MKA_H
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+/* designed by the Lyra PHC team */
+static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
+ const uint64_t m = UINT64_C(0xFFFFFFFF);
+ const uint64_t xy = (x & m) * (y & m);
+ return x + y + 2 * xy;
+}
+
+#define G(a, b, c, d) \
+ do { \
+ a = fBlaMka(a, b); \
+ d = rotr64(d ^ a, 32); \
+ c = fBlaMka(c, d); \
+ b = rotr64(b ^ c, 24); \
+ a = fBlaMka(a, b); \
+ d = rotr64(d ^ a, 16); \
+ c = fBlaMka(c, d); \
+ b = rotr64(b ^ c, 63); \
+ } while ((void)0, 0)
+
+#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
+ v12, v13, v14, v15) \
+ do { \
+ G(v0, v4, v8, v12); \
+ G(v1, v5, v9, v13); \
+ G(v2, v6, v10, v14); \
+ G(v3, v7, v11, v15); \
+ G(v0, v5, v10, v15); \
+ G(v1, v6, v11, v12); \
+ G(v2, v7, v8, v13); \
+ G(v3, v4, v9, v14); \
+ } while ((void)0, 0)
+
+#endif
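
A minimal sketch (hypothetical, not part of the commit) of how the reference round is applied: Argon2's permutation runs BLAKE2_ROUND_NOMSG over a 4x4 state of 64-bit words, with the G steps first down the columns and then across the diagonals, and with no message words mixed in (hence NOMSG).

    #include <stdint.h>
    #include "blamka-round-ref.h"

    /* Apply one BlaMka round to a 16-word state in place. */
    static void blamka_round(uint64_t v[16]) {
        BLAKE2_ROUND_NOMSG(v[0], v[1], v[2],  v[3],  v[4],  v[5],  v[6],  v[7],
                           v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]);
    }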