Diffstat (limited to 'security/nss/lib/freebl/sha1-armv8.c')
-rw-r--r-- | security/nss/lib/freebl/sha1-armv8.c | 264 |
1 files changed, 264 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/sha1-armv8.c b/security/nss/lib/freebl/sha1-armv8.c
new file mode 100644
index 0000000000..63e4dad33e
--- /dev/null
+++ b/security/nss/lib/freebl/sha1-armv8.c
@@ -0,0 +1,264 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef USE_HW_SHA1
+
+#ifndef __ARM_FEATURE_CRYPTO
+#error "Compiler option is invalid"
+#endif
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+
+#include <arm_neon.h>
+#include <memory.h>
+#include "blapi.h"
+#include "sha_fast.h"
+
+#if !defined(SHA_PUT_W_IN_STACK)
+#define H2X 11
+#else
+#define H2X 0
+#endif
+
+static void shaCompress(SHA_HW_t *X, const PRUint32 *datain);
+
+void
+SHA1_Compress_Native(SHA1Context *ctx)
+{
+    shaCompress(&ctx->H[H2X], ctx->u.w);
+}
+
+/*
+ * SHA: Add data to context.
+ */
+void
+SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len)
+{
+    unsigned int lenB;
+    unsigned int togo;
+
+    if (!len) {
+        return;
+    }
+
+    /* accumulate the byte count. */
+    lenB = (unsigned int)(ctx->size) & 63U;
+
+    ctx->size += len;
+
+    /*
+     * Read the data into W and process blocks as they get full
+     */
+    if (lenB > 0) {
+        togo = 64U - lenB;
+        if (len < togo) {
+            togo = len;
+        }
+        memcpy(ctx->u.b + lenB, dataIn, togo);
+        len -= togo;
+        dataIn += togo;
+        lenB = (lenB + togo) & 63U;
+        if (!lenB) {
+            shaCompress(&ctx->H[H2X], ctx->u.w);
+        }
+    }
+
+    while (len >= 64U) {
+        len -= 64U;
+        shaCompress(&ctx->H[H2X], (PRUint32 *)dataIn);
+        dataIn += 64U;
+    }
+
+    if (len) {
+        memcpy(ctx->u.b, dataIn, len);
+    }
+}
+
+/*
+ * SHA: Compression function, unrolled.
+ */
+static void
+shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
+{
+#define XH(n) X[n - H2X]
+
+    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
+    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
+    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
+    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);
+
+    uint32x4_t abcd = vld1q_u32(&XH(0));
+    PRUint32 e = XH(4);
+
+    const uint32x4_t origABCD = abcd;
+    const PRUint32 origE = e;
+
+    uint32x4_t w0 = vld1q_u32(inbuf);
+    uint32x4_t w1 = vld1q_u32(inbuf + 4);
+    uint32x4_t w2 = vld1q_u32(inbuf + 8);
+    uint32x4_t w3 = vld1q_u32(inbuf + 12);
+
+    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
+    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
+    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
+    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));
+
+    uint32x4_t t0 = vaddq_u32(w0, K0);
+    uint32x4_t t1 = vaddq_u32(w1, K0);
+
+    PRUint32 tmpE;
+
+    /*
+     * Using the following ARM instructions to accelerate SHA1
+     *
+     * sha1c for round 0 - 20
+     * sha1p for round 20 - 40
+     * sha1m for round 40 - 60
+     * sha1p for round 60 - 80
+     * sha1su0 and sha1su1 for message schedule
+     * sha1h for rotate left 30
+     */
+
+    /* Round 0-3 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K0);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 4-7 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K0);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 8-11 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K0);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 12-15 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K1);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 16-19 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K1);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 20-23 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K1);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 24-27 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K1);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 28-31 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K1);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 32-35 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K2);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 36-39 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K2);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 40-43 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K2);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 44-47 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K2);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 48-51 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K2);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 52-55 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K3);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 56-59 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K3);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 60-63 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K3);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 64-67 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K3);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 68-71 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K3);
+    w0 = vsha1su1q_u32(w0, w3);
+
+    /* Round 72-75 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+
+    /* Round 76-79 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+
+    e += origE;
+    abcd = vaddq_u32(origABCD, abcd);
+
+    vst1q_u32(&XH(0), abcd);
+    XH(4) = e;
+}
+
+#endif /* USE_HW_SHA1 */
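For readers unfamiliar with the ARMv8 SHA-1 intrinsics used above, here is a minimal scalar reference sketch of the per-round logic they implement, as specified in FIPS 180-4. It is not part of the patch, and the helper names rotl32, sha1_f and sha1_round are hypothetical: each vsha1cq_u32, vsha1pq_u32 or vsha1mq_u32 call in shaCompress evaluates four such rounds at once using the Ch, Parity or Maj function respectively, vsha1h_u32 performs the rotate-left-by-30, and vsha1su0q_u32/vsha1su1q_u32 compute the message-schedule expansion w[t] = rotl(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1).

#include <stdint.h>

/* Hypothetical scalar reference, not part of the patch: f(t) and one round
 * of SHA-1 as defined in FIPS 180-4. */
static uint32_t
rotl32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

static uint32_t
sha1_f(unsigned t, uint32_t b, uint32_t c, uint32_t d)
{
    if (t < 20)
        return (b & c) | (~b & d);          /* Ch     -> sha1c, K0 = 0x5a827999 */
    if (t < 40)
        return b ^ c ^ d;                   /* Parity -> sha1p, K1 = 0x6ed9eba1 */
    if (t < 60)
        return (b & c) | (b & d) | (c & d); /* Maj    -> sha1m, K2 = 0x8f1bbcdc */
    return b ^ c ^ d;                       /* Parity -> sha1p, K3 = 0xca62c1d6 */
}

/* One round t with expanded message word w and round constant k. */
static void
sha1_round(unsigned t, uint32_t w, uint32_t k,
           uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e)
{
    uint32_t tmp = rotl32(*a, 5) + sha1_f(t, *b, *c, *d) + *e + w + k;
    *e = *d;
    *d = *c;
    *c = rotl32(*b, 30); /* the rotate that sha1h (vsha1h_u32) computes */
    *b = *a;
    *a = tmp;
}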
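How these routines are reached is not shown by this diff. The following is a hedged usage sketch under the assumption that callers go through the public freebl interface declared in blapi.h (SHA1_NewContext, SHA1_Begin, SHA1_Update, SHA1_End, SHA1_DestroyContext) and that sha_fast.c/sha_fast.h route to SHA1_Update_Native and SHA1_Compress_Native at runtime when the CPU advertises the ARMv8 SHA-1 extension; that dispatch is an assumption here, not something this file shows. The file itself must be built with the crypto extension enabled (typically -march=armv8-a+crypto, which defines __ARM_FEATURE_CRYPTO and satisfies the guard at the top).

#include "blapi.h"

/* Hedged sketch: hash_message is a hypothetical helper, not part of NSS. */
static SECStatus
hash_message(const unsigned char *msg, unsigned int msgLen,
             unsigned char digest[SHA1_LENGTH])
{
    SHA1Context *ctx = SHA1_NewContext();
    unsigned int digestLen = 0;

    if (!ctx) {
        return SECFailure;
    }
    SHA1_Begin(ctx);
    SHA1_Update(ctx, msg, msgLen);  /* expected to route to SHA1_Update_Native
                                     * on CPUs with the SHA-1 extension */
    SHA1_End(ctx, digest, &digestLen, SHA1_LENGTH);
    SHA1_DestroyContext(ctx, PR_TRUE);
    return (digestLen == SHA1_LENGTH) ? SECSuccess : SECFailure;
}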