5 files changed, 1162 insertions, 0 deletions
diff --git a/lib/crypto_backend/argon2/blake2/blake2-impl.h b/lib/crypto_backend/argon2/blake2/blake2-impl.h
new file mode 100644
index 0000000..dcac827
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blake2-impl.h
@@ -0,0 +1,154 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef PORTABLE_BLAKE2_IMPL_H
+#define PORTABLE_BLAKE2_IMPL_H
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(_MSC_VER)
+#define BLAKE2_INLINE __inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define BLAKE2_INLINE __inline__
+#else
+#define BLAKE2_INLINE
+#endif
+
+/* Argon2 Team - Begin Code */
+/*
+   Not an exhaustive list, but should cover the majority of modern platforms
+   Additionally, the code will always be correct---this is only a performance
+   tweak.
+*/
+#if (defined(__BYTE_ORDER__) &&                                                \
+     (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) ||                           \
+    defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
+    defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) ||       \
+    defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) ||                \
+    defined(_M_ARM)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+/* Argon2 Team - End Code */
+
+static BLAKE2_INLINE uint32_t load32(const void *src) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+    uint32_t w;
+    memcpy(&w, src, sizeof w);
+    return w;
+#else
+    const uint8_t *p = (const uint8_t *)src;
+    uint32_t w = *p++;
+    w |= (uint32_t)(*p++) << 8;
+    w |= (uint32_t)(*p++) << 16;
+    w |= (uint32_t)(*p++) << 24;
+    return w;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load64(const void *src) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+    uint64_t w;
+    memcpy(&w, src, sizeof w);
+    return w;
+#else
+    const uint8_t *p = (const uint8_t *)src;
+    uint64_t w = *p++;
+    w |= (uint64_t)(*p++) << 8;
+    w |= (uint64_t)(*p++) << 16;
+    w |= (uint64_t)(*p++) << 24;
+    w |= (uint64_t)(*p++) << 32;
+    w |= (uint64_t)(*p++) << 40;
+    w |= (uint64_t)(*p++) << 48;
+    w |= (uint64_t)(*p++) << 56;
+    return w;
+#endif
+}
+
+static BLAKE2_INLINE void store32(void *dst, uint32_t w) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+    memcpy(dst, &w, sizeof w);
+#else
+    uint8_t *p = (uint8_t *)dst;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+#endif
+}
+
+static BLAKE2_INLINE void store64(void *dst, uint64_t w) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+    memcpy(dst, &w, sizeof w);
+#else
+    uint8_t *p = (uint8_t *)dst;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load48(const void *src) {
+    const uint8_t *p = (const uint8_t *)src;
+    uint64_t w = *p++;
+    w |= (uint64_t)(*p++) << 8;
+    w |= (uint64_t)(*p++) << 16;
+    w |= (uint64_t)(*p++) << 24;
+    w |= (uint64_t)(*p++) << 32;
+    w |= (uint64_t)(*p++) << 40;
+    return w;
+}
+
+static BLAKE2_INLINE void store48(void *dst, uint64_t w) {
+    uint8_t *p = (uint8_t *)dst;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+}
+
+static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) {
+    return (w >> c) | (w << (32 - c));
+}
+
+static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
+    return (w >> c) | (w << (64 - c));
+}
+
+#endif
diff --git a/lib/crypto_backend/argon2/blake2/blake2.h b/lib/crypto_backend/argon2/blake2/blake2.h
new file mode 100644
index 0000000..0c1b0ee
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blake2.h
@@ -0,0 +1,89 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef PORTABLE_BLAKE2_H
+#define PORTABLE_BLAKE2_H
+
+#include "../argon2.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+enum blake2b_constant {
+    BLAKE2B_BLOCKBYTES = 128,
+    BLAKE2B_OUTBYTES = 64,
+    BLAKE2B_KEYBYTES = 64,
+    BLAKE2B_SALTBYTES = 16,
+    BLAKE2B_PERSONALBYTES = 16
+};
+
+#pragma pack(push, 1)
+typedef struct __blake2b_param {
+    uint8_t digest_length;                   /* 1 */
+    uint8_t key_length;                      /* 2 */
+    uint8_t fanout;                          /* 3 */
+    uint8_t depth;                           /* 4 */
+    uint32_t leaf_length;                    /* 8 */
+    uint64_t node_offset;                    /* 16 */
+    uint8_t node_depth;                      /* 17 */
+    uint8_t inner_length;                    /* 18 */
+    uint8_t reserved[14];                    /* 32 */
+    uint8_t salt[BLAKE2B_SALTBYTES];         /* 48 */
+    uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
+} blake2b_param;
+#pragma pack(pop)
+
+typedef struct __blake2b_state {
+    uint64_t h[8];
+    uint64_t t[2];
+    uint64_t f[2];
+    uint8_t buf[BLAKE2B_BLOCKBYTES];
+    unsigned buflen;
+    unsigned outlen;
+    uint8_t last_node;
+} blake2b_state;
+
+/* Ensure param structs have not been wrongly padded */
+/* Poor man's static_assert */
+enum {
+    blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
+    blake2_size_check_2 =
+        1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT)
+};
+
+/* Streaming API */
+ARGON2_LOCAL int blake2b_init(blake2b_state *S, size_t outlen);
+ARGON2_LOCAL int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+                     size_t keylen);
+ARGON2_LOCAL int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
+ARGON2_LOCAL int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
+ARGON2_LOCAL int blake2b_final(blake2b_state *S, void *out, size_t outlen);
+
+/* Simple API */
+ARGON2_LOCAL int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
+                         const void *key, size_t keylen);
+
+/* Argon2 Team - Begin Code */
+ARGON2_LOCAL int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
+/* Argon2 Team - End Code */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/lib/crypto_backend/argon2/blake2/blake2b.c b/lib/crypto_backend/argon2/blake2/blake2b.c
new file mode 100644
index 0000000..d8f69e8
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blake2b.c
@@ -0,0 +1,392 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+void clear_internal_memory(void *v, size_t n);
+
+static const uint64_t blake2b_IV[8] = {
+    UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
+    UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
+    UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
+    UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)};
+
+static const unsigned int blake2b_sigma[12][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+    {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+    {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+    {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+    {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+    {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+    {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+    {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+    {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+};
+
+static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) {
+    S->f[1] = (uint64_t)-1;
+}
+
+static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) {
+    if (S->last_node) {
+        blake2b_set_lastnode(S);
+    }
+    S->f[0] = (uint64_t)-1;
+}
+
+static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S,
+                                                    uint64_t inc) {
+    S->t[0] += inc;
+    S->t[1] += (S->t[0] < inc);
+}
+
+static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) {
+    clear_internal_memory(S, sizeof(*S));      /* wipe */
+    blake2b_set_lastblock(S); /* invalidate for further use */
+}
+
+static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) {
+    memset(S, 0, sizeof(*S));
+    memcpy(S->h, blake2b_IV, sizeof(S->h));
+}
+
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P) {
+    const unsigned char *p = (const unsigned char *)P;
+    unsigned int i;
+
+    if (NULL == P || NULL == S) {
+        return -1;
+    }
+
+    blake2b_init0(S);
+    /* IV XOR Parameter Block */
+    for (i = 0; i < 8; ++i) {
+        S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
+    }
+    S->outlen = P->digest_length;
+    return 0;
+}
+
+/* Sequential blake2b initialization */
+int blake2b_init(blake2b_state *S, size_t outlen) {
+    blake2b_param P;
+
+    if (S == NULL) {
+        return -1;
+    }
+
+    if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
+        blake2b_invalidate_state(S);
+        return -1;
+    }
+
+    /* Setup Parameter Block for unkeyed BLAKE2 */
+    P.digest_length = (uint8_t)outlen;
+    P.key_length = 0;
+    P.fanout = 1;
+    P.depth = 1;
+    P.leaf_length = 0;
+    P.node_offset = 0;
+    P.node_depth = 0;
+    P.inner_length = 0;
+    memset(P.reserved, 0, sizeof(P.reserved));
+    memset(P.salt, 0, sizeof(P.salt));
+    memset(P.personal, 0, sizeof(P.personal));
+
+    return blake2b_init_param(S, &P);
+}
+
+int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+                     size_t keylen) {
+    blake2b_param P;
+
+    if (S == NULL) {
+        return -1;
+    }
+
+    if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
+        blake2b_invalidate_state(S);
+        return -1;
+    }
+
+    if ((key == 0) || (keylen == 0) || (keylen > BLAKE2B_KEYBYTES)) {
+        blake2b_invalidate_state(S);
+        return -1;
+    }
+
+    /* Setup Parameter Block for keyed BLAKE2 */
+    P.digest_length = (uint8_t)outlen;
+    P.key_length = (uint8_t)keylen;
+    P.fanout = 1;
+    P.depth = 1;
+    P.leaf_length = 0;
+    P.node_offset = 0;
+    P.node_depth = 0;
+    P.inner_length = 0;
+    memset(P.reserved, 0, sizeof(P.reserved));
+    memset(P.salt, 0, sizeof(P.salt));
+    memset(P.personal, 0, sizeof(P.personal));
+
+    if (blake2b_init_param(S, &P) < 0) {
+        blake2b_invalidate_state(S);
+        return -1;
+    }
+
+    {
+        uint8_t block[BLAKE2B_BLOCKBYTES];
+        memset(block, 0, BLAKE2B_BLOCKBYTES);
+        memcpy(block, key, keylen);
+        blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
+        /* Burn the key from stack */
+        clear_internal_memory(block, BLAKE2B_BLOCKBYTES);
+    }
+    return 0;
+}
+
+static void blake2b_compress(blake2b_state *S, const uint8_t *block) {
+    uint64_t m[16];
+    uint64_t v[16];
+    unsigned int i, r;
+
+    for (i = 0; i < 16; ++i) {
+        m[i] = load64(block + i * sizeof(m[i]));
+    }
+
+    for (i = 0; i < 8; ++i) {
+        v[i] = S->h[i];
+    }
+
+    v[8] = blake2b_IV[0];
+    v[9] = blake2b_IV[1];
+    v[10] = blake2b_IV[2];
+    v[11] = blake2b_IV[3];
+    v[12] = blake2b_IV[4] ^ S->t[0];
+    v[13] = blake2b_IV[5] ^ S->t[1];
+    v[14] = blake2b_IV[6] ^ S->f[0];
+    v[15] = blake2b_IV[7] ^ S->f[1];
+
+#define G(r, i, a, b, c, d)                                                    \
+    do {                                                                       \
+        a = a + b + m[blake2b_sigma[r][2 * i + 0]];                            \
+        d = rotr64(d ^ a, 32);                                                 \
+        c = c + d;                                                             \
+        b = rotr64(b ^ c, 24);                                                 \
+        a = a + b + m[blake2b_sigma[r][2 * i + 1]];                            \
+        d = rotr64(d ^ a, 16);                                                 \
+        c = c + d;                                                             \
+        b = rotr64(b ^ c, 63);                                                 \
+    } while ((void)0, 0)
+
+#define ROUND(r)                                                               \
+    do {                                                                       \
+        G(r, 0, v[0], v[4], v[8], v[12]);                                      \
+        G(r, 1, v[1], v[5], v[9], v[13]);                                      \
+        G(r, 2, v[2], v[6], v[10], v[14]);                                     \
+        G(r, 3, v[3], v[7], v[11], v[15]);                                     \
+        G(r, 4, v[0], v[5], v[10], v[15]);                                     \
+        G(r, 5, v[1], v[6], v[11], v[12]);                                     \
+        G(r, 6, v[2], v[7], v[8], v[13]);                                      \
+        G(r, 7, v[3], v[4], v[9], v[14]);                                      \
+    } while ((void)0, 0)
+
+    for (r = 0; r < 12; ++r) {
+        ROUND(r);
+    }
+
+    for (i = 0; i < 8; ++i) {
+        S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+    }
+
+#undef G
+#undef ROUND
+}
+
+int blake2b_update(blake2b_state *S, const void *in, size_t inlen) {
+    const uint8_t *pin = (const uint8_t *)in;
+
+    if (inlen == 0) {
+        return 0;
+    }
+
+    /* Sanity check */
+    if (S == NULL || in == NULL) {
+        return -1;
+    }
+
+    /* Is this a reused state? */
+    if (S->f[0] != 0) {
+        return -1;
+    }
+
+    if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) {
+        /* Complete current block */
+        size_t left = S->buflen;
+        size_t fill = BLAKE2B_BLOCKBYTES - left;
+        memcpy(&S->buf[left], pin, fill);
+        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+        blake2b_compress(S, S->buf);
+        S->buflen = 0;
+        inlen -= fill;
+        pin += fill;
+        /* Avoid buffer copies when possible */
+        while (inlen > BLAKE2B_BLOCKBYTES) {
+            blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+            blake2b_compress(S, pin);
+            inlen -= BLAKE2B_BLOCKBYTES;
+            pin += BLAKE2B_BLOCKBYTES;
+        }
+    }
+    memcpy(&S->buf[S->buflen], pin, inlen);
+    S->buflen += (unsigned int)inlen;
+    return 0;
+}
+
+int blake2b_final(blake2b_state *S, void *out, size_t outlen) {
+    uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
+    unsigned int i;
+
+    /* Sanity checks */
+    if (S == NULL || out == NULL || outlen < S->outlen) {
+        return -1;
+    }
+
+    /* Is this a reused state? */
+    if (S->f[0] != 0) {
+        return -1;
+    }
+
+    blake2b_increment_counter(S, S->buflen);
+    blake2b_set_lastblock(S);
+    memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
+    blake2b_compress(S, S->buf);
+
+    for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
+        store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
+    }
+
+    memcpy(out, buffer, S->outlen);
+    clear_internal_memory(buffer, sizeof(buffer));
+    clear_internal_memory(S->buf, sizeof(S->buf));
+    clear_internal_memory(S->h, sizeof(S->h));
+    return 0;
+}
+
+int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
+            const void *key, size_t keylen) {
+    blake2b_state S;
+    int ret = -1;
+
+    /* Verify parameters */
+    if (NULL == in && inlen > 0) {
+        goto fail;
+    }
+
+    if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
+        goto fail;
+    }
+
+    if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
+        goto fail;
+    }
+
+    if (keylen > 0) {
+        if (blake2b_init_key(&S, outlen, key, keylen) < 0) {
+            goto fail;
+        }
+    } else {
+        if (blake2b_init(&S, outlen) < 0) {
+            goto fail;
+        }
+    }
+
+    if (blake2b_update(&S, in, inlen) < 0) {
+        goto fail;
+    }
+    ret = blake2b_final(&S, out, outlen);
+
+fail:
+    clear_internal_memory(&S, sizeof(S));
+    return ret;
+}
+
+/* Argon2 Team - Begin Code */
+int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
+    uint8_t *out = (uint8_t *)pout;
+    blake2b_state blake_state;
+    uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
+    int ret = -1;
+
+    if (outlen > UINT32_MAX) {
+        goto fail;
+    }
+
+    /* Ensure little-endian byte order! */
+    store32(outlen_bytes, (uint32_t)outlen);
+
+#define TRY(statement)                                                         \
+    do {                                                                       \
+        ret = statement;                                                       \
+        if (ret < 0) {                                                         \
+            goto fail;                                                         \
+        }                                                                      \
+    } while ((void)0, 0)
+
+    if (outlen <= BLAKE2B_OUTBYTES) {
+        TRY(blake2b_init(&blake_state, outlen));
+        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(blake2b_update(&blake_state, in, inlen));
+        TRY(blake2b_final(&blake_state, out, outlen));
+    } else {
+        uint32_t toproduce;
+        uint8_t out_buffer[BLAKE2B_OUTBYTES];
+        uint8_t in_buffer[BLAKE2B_OUTBYTES];
+        TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
+        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(blake2b_update(&blake_state, in, inlen));
+        TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
+        memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+        out += BLAKE2B_OUTBYTES / 2;
+        toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
+
+        while (toproduce > BLAKE2B_OUTBYTES) {
+            memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+            TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
+                        BLAKE2B_OUTBYTES, NULL, 0));
+            memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+            out += BLAKE2B_OUTBYTES / 2;
+            toproduce -= BLAKE2B_OUTBYTES / 2;
+        }
+
+        memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+        TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
+                    0));
+        memcpy(out, out_buffer, toproduce);
+    }
+fail:
+    clear_internal_memory(&blake_state, sizeof(blake_state));
+    return ret;
+#undef TRY
+}
+/* Argon2 Team - End Code */
diff --git a/lib/crypto_backend/argon2/blake2/blamka-round-opt.h b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
new file mode 100644
index 0000000..3127f2a
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blamka-round-opt.h
@@ -0,0 +1,471 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
+#endif
+
+#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
+#include <x86intrin.h>
+#endif
+
+#if !defined(__AVX512F__)
+#if !defined(__AVX2__)
+#if !defined(__XOP__)
+#if defined(__SSSE3__)
+#define r16                                                                    \
+    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24                                                                    \
+    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c)                                                   \
+    (-(c) == 32)                                                               \
+        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                      \
+        : (-(c) == 24)                                                         \
+              ? _mm_shuffle_epi8((x), r24)                                     \
+              : (-(c) == 16)                                                   \
+                    ? _mm_shuffle_epi8((x), r16)                               \
+                    : (-(c) == 63)                                             \
+                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_add_epi64((x), (x)))             \
+                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_slli_epi64((x), 64 - (-(c))))
+#else /* defined(__SSE2__) */
+#define _mm_roti_epi64(r, c)                                                   \
+    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
+#endif
+#else
+#endif
+
+static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
+    const __m128i z = _mm_mul_epu32(x, y);
+    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -32);                                          \
+        D1 = _mm_roti_epi64(D1, -32);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -24);                                          \
+        B1 = _mm_roti_epi64(B1, -24);                                          \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -16);                                          \
+        D1 = _mm_roti_epi64(D1, -16);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -63);                                          \
+        B1 = _mm_roti_epi64(B1, -63);                                          \
+    } while ((void)0, 0)
+
+#if defined(__SSSE3__)
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        t1 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        t1 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+#else /* SSE2 */
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = D0;                                                       \
+        __m128i t1 = B0;                                                       \
+        D0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = D0;                                                               \
+        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0));               \
+        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1));               \
+        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1));               \
+        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0, t1;                                                        \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+        t0 = B0;                                                               \
+        t1 = D0;                                                               \
+        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0));               \
+        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1));               \
+        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1));               \
+        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+#endif
+
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                           \
+    do {                                                                       \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                           \
+                                                                               \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \
+    } while ((void)0, 0)
+#else /* __AVX2__ */
+
+#include <immintrin.h>
+
+#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr32(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr24(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr32(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr24(B1); \
+    } while((void)0, 0);
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr16(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr63(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr16(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr63(B1); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while(0);
+
+#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while((void)0, 0);
+
+#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    } while((void)0, 0);
+
+#endif /* __AVX2__ */
+
+#else /* __AVX512F__ */
+
+#include <immintrin.h>
+
+#define ror64(x, n) _mm512_ror_epi64((x), (n))
+
+static __m512i muladd(__m512i x, __m512i y)
+{
+    __m512i z = _mm512_mul_epu32(x, y);
+    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 32); \
+        D1 = ror64(D1, 32); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 24); \
+        B1 = ror64(B1, 24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 16); \
+        D1 = ror64(D1, 16); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 63); \
+        B1 = ror64(B1, 63); \
+    } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+
+#define SWAP_HALVES(A0, A1) \
+    do { \
+        __m512i t0, t1; \
+        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+        A0 = t0; \
+        A1 = t1; \
+    } while((void)0, 0)
+
+#define SWAP_QUARTERS(A0, A1) \
+    do { \
+        SWAP_HALVES(A0, A1); \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+    } while((void)0, 0)
+
+#define UNSWAP_QUARTERS(A0, A1) \
+    do { \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        SWAP_HALVES(A0, A1); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
+    do { \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        SWAP_QUARTERS(A0, A1); \
+        SWAP_QUARTERS(B0, B1); \
+        SWAP_QUARTERS(C0, C1); \
+        SWAP_QUARTERS(D0, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        UNSWAP_QUARTERS(A0, A1); \
+        UNSWAP_QUARTERS(B0, B1); \
+        UNSWAP_QUARTERS(C0, C1); \
+        UNSWAP_QUARTERS(D0, D1); \
+    } while ((void)0, 0)
+
+#endif /* __AVX512F__ */
+#endif /* BLAKE_ROUND_MKA_OPT_H */
diff --git a/lib/crypto_backend/argon2/blake2/blamka-round-ref.h b/lib/crypto_backend/argon2/blake2/blamka-round-ref.h
new file mode 100644
index 0000000..16cfc1c
--- /dev/null
+++ b/lib/crypto_backend/argon2/blake2/blamka-round-ref.h
@@ -0,0 +1,56 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_H
+#define BLAKE_ROUND_MKA_H
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+/* designed by the Lyra PHC team */
+static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
+    const uint64_t m = UINT64_C(0xFFFFFFFF);
+    const uint64_t xy = (x & m) * (y & m);
+    return x + y + 2 * xy;
+}
+
+#define G(a, b, c, d)                                                          \
+    do {                                                                       \
+        a = fBlaMka(a, b);                                                     \
+        d = rotr64(d ^ a, 32);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 24);                                                 \
+        a = fBlaMka(a, b);                                                     \
+        d = rotr64(d ^ a, 16);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 63);                                                 \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,   \
+                           v12, v13, v14, v15)                                 \
+    do {                                                                       \
+        G(v0, v4, v8, v12);                                                    \
+        G(v1, v5, v9, v13);                                                    \
+        G(v2, v6, v10, v14);                                                   \
+        G(v3, v7, v11, v15);                                                   \
+        G(v0, v5, v10, v15);                                                   \
+        G(v1, v6, v11, v12);                                                   \
+        G(v2, v7, v8, v13);                                                    \
+        G(v3, v4, v9, v14);                                                    \
+    } while ((void)0, 0)
+
+#endif