Diffstat:
 -rw-r--r--  security/nss/lib/freebl/sha1-armv8.c  264
 1 file changed, 264 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/sha1-armv8.c b/security/nss/lib/freebl/sha1-armv8.c
new file mode 100644
index 0000000000..63e4dad33e
--- /dev/null
+++ b/security/nss/lib/freebl/sha1-armv8.c
@@ -0,0 +1,264 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef USE_HW_SHA1
+
+#ifndef __ARM_FEATURE_CRYPTO
+#error "This file requires the ARMv8 Crypto Extensions (e.g. compile with -march=armv8-a+crypto)"
+#endif
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+
+#include <arm_neon.h>
+#include <string.h> /* memcpy */
+#include "blapi.h"
+#include "sha_fast.h"
+
+#if !defined(SHA_PUT_W_IN_STACK)
+#define H2X 11 /* X[0] is H[11], and H[0] is X[-11] */
+#else
+#define H2X 0 /* X[0] is H[0] */
+#endif
+
+static void shaCompress(SHA_HW_t *X, const PRUint32 *inbuf);
+
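+/* Compress the 64-byte block currently buffered in the context (ctx->u). */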
+void
+SHA1_Compress_Native(SHA1Context *ctx)
+{
+    shaCompress(&ctx->H[H2X], ctx->u.w);
+}
+
+/*
+ * SHA: Add data to context.
+ */
+void
+SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len)
+{
+    unsigned int lenB;
+    unsigned int togo;
+
+    if (!len) {
+        return;
+    }
+
+    /* Accumulate the byte count. */
+    lenB = (unsigned int)(ctx->size) & 63U;
+
+    ctx->size += len;
+
+    /*
+     * Read the data into the block buffer and compress blocks as they fill.
+     */
+    if (lenB > 0) {
+        togo = 64U - lenB;
+        if (len < togo) {
+            togo = len;
+        }
+        memcpy(ctx->u.b + lenB, dataIn, togo);
+        len -= togo;
+        dataIn += togo;
+        lenB = (lenB + togo) & 63U;
+        if (!lenB) {
+            shaCompress(&ctx->H[H2X], ctx->u.w);
+        }
+    }
+
+    while (len >= 64U) {
+        len -= 64U;
+        shaCompress(&ctx->H[H2X], (const PRUint32 *)dataIn);
+        dataIn += 64U;
+    }
+
+    if (len) {
+        memcpy(ctx->u.b, dataIn, len);
+    }
+}
+
+/*
+ * SHA: Compression function, unrolled.
+ */
+static void
+shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
+{
+#define XH(n) X[(n) - H2X]
+
+    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
+    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
+    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
+    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);
+
+    uint32x4_t abcd = vld1q_u32(&XH(0));
+    PRUint32 e = XH(4);
+
+    const uint32x4_t origABCD = abcd;
+    const PRUint32 origE = e;
+
+    uint32x4_t w0 = vld1q_u32(inbuf);
+    uint32x4_t w1 = vld1q_u32(inbuf + 4);
+    uint32x4_t w2 = vld1q_u32(inbuf + 8);
+    uint32x4_t w3 = vld1q_u32(inbuf + 12);
+
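+    /* SHA-1 reads the message as big-endian 32-bit words; byte-swap each lane (assumes a little-endian target). */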
+    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
+    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
+    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
+    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));
+
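+    /* Pre-stage W + K for the first two round quads; t0/t1 are refilled as the rounds consume them. */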
+    uint32x4_t t0 = vaddq_u32(w0, K0);
+    uint32x4_t t1 = vaddq_u32(w1, K0);
+
+    PRUint32 tmpE;
+
+    /*
+     * The following ARMv8 Crypto Extension instructions accelerate SHA-1:
+     *
+     *   sha1c for rounds 0-19 (Ch)
+     *   sha1p for rounds 20-39 and 60-79 (Parity)
+     *   sha1m for rounds 40-59 (Maj)
+     *   sha1su0 and sha1su1 for the message schedule
+     *   sha1h for rotate left by 30
+     */
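+    /*
+     * For reference, a scalar sketch (not what executes here): each sha1c,
+     * sha1p, or sha1m step below performs four SHA-1 rounds of the form
+     *
+     *   e += rotl(a, 5) + F(b, c, d) + K + W[i];
+     *   b = rotl(b, 30);
+     *   (a, b, c, d, e) = (e, a, b, c, d);
+     *
+     * where F is Ch, Parity, or Maj depending on the round group.
+     */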
+
+    /* Round 0-3 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K0);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 4-7 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K0);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 8-11 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K0);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 12-15 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K1);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 16-19 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K1);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 20-23 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K1);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 24-27 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K1);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 28-31 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K1);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 32-35 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K2);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 36-39 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K2);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 40-43 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K2);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 44-47 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K2);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 48-51 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K2);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 52-55 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K3);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 56-59 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K3);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 60-63 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K3);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 64-67 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K3);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 68-71 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K3);
+    w0 = vsha1su1q_u32(w0, w3);
+
+    /* Round 72-75 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+
+    /* Round 76-79 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+
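+    /* Davies-Meyer feed-forward: add the original state back into the new state. */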
+    e += origE;
+    abcd = vaddq_u32(origABCD, abcd);
+
+    vst1q_u32(&XH(0), abcd);
+    XH(4) = e;
+}
+
+#endif /* USE_HW_SHA1 */
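
For context, a minimal caller sketch using the public freebl SHA-1 entry points declared in blapi.h. This is a hedged illustration: on ARMv8 builds with the Crypto Extensions, freebl routes SHA1_Update to SHA1_Update_Native above, but that dispatch logic lives outside this file.

    #include <stdio.h>
    #include "blapi.h"

    int
    main(void)
    {
        unsigned char digest[SHA1_LENGTH];
        unsigned int len = 0;
        SHA1Context *ctx = SHA1_NewContext();
        if (!ctx)
            return 1;
        SHA1_Begin(ctx);
        SHA1_Update(ctx, (const unsigned char *)"abc", 3);
        SHA1_End(ctx, digest, &len, sizeof(digest));
        SHA1_DestroyContext(ctx, PR_TRUE);
        /* Expect a9993e364706816aba3e25717850c26c9cd0d89d for "abc". */
        for (unsigned int i = 0; i < len; i++)
            printf("%02x", digest[i]);
        printf("\n");
        return 0;
    }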