Diffstat (limited to 'security/nss/lib/freebl/sha1-armv8.c')
-rw-r--r-- | security/nss/lib/freebl/sha1-armv8.c | 264 |
1 files changed, 264 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/sha1-armv8.c b/security/nss/lib/freebl/sha1-armv8.c
new file mode 100644
index 0000000000..63e4dad33e
--- /dev/null
+++ b/security/nss/lib/freebl/sha1-armv8.c
@@ -0,0 +1,264 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef USE_HW_SHA1
+
+#ifndef __ARM_FEATURE_CRYPTO
+#error "Compiler option is invalid"
+#endif
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+
+#include <arm_neon.h>
+#include <memory.h>
+#include "blapi.h"
+#include "sha_fast.h"
+
+#if !defined(SHA_PUT_W_IN_STACK)
+#define H2X 11
+#else
+#define H2X 0
+#endif
+
+static void shaCompress(SHA_HW_t *X, const PRUint32 *datain);
+
+void
+SHA1_Compress_Native(SHA1Context *ctx)
+{
+    shaCompress(&ctx->H[H2X], ctx->u.w);
+}
+
+/*
+ * SHA: Add data to context.
+ */
+void
+SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len)
+{
+    unsigned int lenB;
+    unsigned int togo;
+
+    if (!len) {
+        return;
+    }
+
+    /* accumulate the byte count. */
+    lenB = (unsigned int)(ctx->size) & 63U;
+
+    ctx->size += len;
+
+    /*
+     * Read the data into W and process blocks as they get full
+     */
+    if (lenB > 0) {
+        togo = 64U - lenB;
+        if (len < togo) {
+            togo = len;
+        }
+        memcpy(ctx->u.b + lenB, dataIn, togo);
+        len -= togo;
+        dataIn += togo;
+        lenB = (lenB + togo) & 63U;
+        if (!lenB) {
+            shaCompress(&ctx->H[H2X], ctx->u.w);
+        }
+    }
+
+    while (len >= 64U) {
+        len -= 64U;
+        shaCompress(&ctx->H[H2X], (PRUint32 *)dataIn);
+        dataIn += 64U;
+    }
+
+    if (len) {
+        memcpy(ctx->u.b, dataIn, len);
+    }
+}
+
+/*
+ * SHA: Compression function, unrolled.
+ */
+static void
+shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
+{
+#define XH(n) X[n - H2X]
+
+    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
+    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
+    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
+    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);
+
+    uint32x4_t abcd = vld1q_u32(&XH(0));
+    PRUint32 e = XH(4);
+
+    const uint32x4_t origABCD = abcd;
+    const PRUint32 origE = e;
+
+    uint32x4_t w0 = vld1q_u32(inbuf);
+    uint32x4_t w1 = vld1q_u32(inbuf + 4);
+    uint32x4_t w2 = vld1q_u32(inbuf + 8);
+    uint32x4_t w3 = vld1q_u32(inbuf + 12);
+
+    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
+    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
+    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
+    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));
+
+    uint32x4_t t0 = vaddq_u32(w0, K0);
+    uint32x4_t t1 = vaddq_u32(w1, K0);
+
+    PRUint32 tmpE;
+
+    /*
+     * Using the following ARM instructions to accelerate SHA1
+     *
+     * sha1c for round 0 - 20
+     * sha1p for round 20 - 40
+     * sha1m for round 40 - 60
+     * sha1p for round 60 - 80
+     * sha1su0 and sha1su1 for message schedule
+     * sha1h for rotate left 30
+     */
+
+    /* Round 0-3 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K0);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 4-7 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K0);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 8-11 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K0);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 12-15 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K1);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 16-19 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1cq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K1);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 20-23 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K1);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 24-27 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K1);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 28-31 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K1);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 32-35 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K2);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 36-39 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K2);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 40-43 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K2);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 44-47 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K2);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 48-51 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K2);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 52-55 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K3);
+    w0 = vsha1su1q_u32(w0, w3);
+    w1 = vsha1su0q_u32(w1, w2, w3);
+
+    /* Round 56-59 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1mq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w0, K3);
+    w1 = vsha1su1q_u32(w1, w0);
+    w2 = vsha1su0q_u32(w2, w3, w0);
+
+    /* Round 60-63 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w1, K3);
+    w2 = vsha1su1q_u32(w2, w1);
+    w3 = vsha1su0q_u32(w3, w0, w1);
+
+    /* Round 64-67 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+    t0 = vaddq_u32(w2, K3);
+    w3 = vsha1su1q_u32(w3, w2);
+    w0 = vsha1su0q_u32(w0, w1, w2);
+
+    /* Round 68-71 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+    t1 = vaddq_u32(w3, K3);
+    w0 = vsha1su1q_u32(w0, w3);
+
+    /* Round 72-75 */
+    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, e, t0);
+
+    /* Round 76-79 */
+    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+    abcd = vsha1pq_u32(abcd, tmpE, t1);
+
+    e += origE;
+    abcd = vaddq_u32(origABCD, abcd);
+
+    vst1q_u32(&XH(0), abcd);
+    XH(4) = e;
+}
+
+#endif /* USE_HW_SHA1 */
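For readers unfamiliar with the ARMv8 SHA-1 intrinsics used above, here is a minimal scalar reference sketch of the per-round logic they implement, as specified in FIPS 180-4. It is not part of the patch, and the helper names rotl32, sha1_f and sha1_round are hypothetical: each vsha1cq_u32, vsha1pq_u32 or vsha1mq_u32 call in shaCompress evaluates four such rounds at once using the Ch, Parity or Maj function respectively, vsha1h_u32 performs the rotate-left-by-30, and vsha1su0q_u32/vsha1su1q_u32 compute the message-schedule expansion w[t] = rotl(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1).

#include <stdint.h>

/* Hypothetical scalar reference, not part of the patch: f(t) and one round
 * of SHA-1 as defined in FIPS 180-4. */
static uint32_t
rotl32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

static uint32_t
sha1_f(unsigned t, uint32_t b, uint32_t c, uint32_t d)
{
    if (t < 20)
        return (b & c) | (~b & d);          /* Ch     -> sha1c, K0 = 0x5a827999 */
    if (t < 40)
        return b ^ c ^ d;                   /* Parity -> sha1p, K1 = 0x6ed9eba1 */
    if (t < 60)
        return (b & c) | (b & d) | (c & d); /* Maj    -> sha1m, K2 = 0x8f1bbcdc */
    return b ^ c ^ d;                       /* Parity -> sha1p, K3 = 0xca62c1d6 */
}

/* One round t with expanded message word w and round constant k. */
static void
sha1_round(unsigned t, uint32_t w, uint32_t k,
           uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e)
{
    uint32_t tmp = rotl32(*a, 5) + sha1_f(t, *b, *c, *d) + *e + w + k;
    *e = *d;
    *d = *c;
    *c = rotl32(*b, 30); /* the rotate that sha1h (vsha1h_u32) computes */
    *b = *a;
    *a = tmp;
}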
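How these routines are reached is not shown by this diff. The following is a hedged usage sketch under the assumption that callers go through the public freebl interface declared in blapi.h (SHA1_NewContext, SHA1_Begin, SHA1_Update, SHA1_End, SHA1_DestroyContext) and that sha_fast.c/sha_fast.h route to SHA1_Update_Native and SHA1_Compress_Native at runtime when the CPU advertises the ARMv8 SHA-1 extension; that dispatch is an assumption here, not something this file shows. The file itself must be built with the crypto extension enabled (typically -march=armv8-a+crypto, which defines __ARM_FEATURE_CRYPTO and satisfies the guard at the top).

#include "blapi.h"

/* Hedged sketch: hash_message is a hypothetical helper, not part of NSS. */
static SECStatus
hash_message(const unsigned char *msg, unsigned int msgLen,
             unsigned char digest[SHA1_LENGTH])
{
    SHA1Context *ctx = SHA1_NewContext();
    unsigned int digestLen = 0;

    if (!ctx) {
        return SECFailure;
    }
    SHA1_Begin(ctx);
    SHA1_Update(ctx, msg, msgLen);  /* expected to route to SHA1_Update_Native
                                     * on CPUs with the SHA-1 extension */
    SHA1_End(ctx, digest, &digestLen, SHA1_LENGTH);
    SHA1_DestroyContext(ctx, PR_TRUE);
    return (digestLen == SHA1_LENGTH) ? SECSuccess : SECFailure;
}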