/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifdef USE_HW_SHA1

#ifndef __ARM_FEATURE_CRYPTO
#error "Compiler option is invalid"
#endif

#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif

#include <arm_neon.h>
#include <memory.h>

#include "blapi.h"
#include "sha_fast.h"

#if !defined(SHA_PUT_W_IN_STACK)
#define H2X 11
#else
#define H2X 0
#endif

static void shaCompress(SHA_HW_t *X, const PRUint32 *datain);

void
SHA1_Compress_Native(SHA1Context *ctx)
{
    shaCompress(&ctx->H[H2X], ctx->u.w);
}

/*
 * SHA: Add data to context.
 */
void
SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn,
                   unsigned int len)
{
    unsigned int lenB;
    unsigned int togo;

    if (!len) {
        return;
    }

    /* accumulate the byte count. */
    lenB = (unsigned int)(ctx->size) & 63U;

    ctx->size += len;

    /*
     * Read the data into W and process blocks as they get full
     */
    if (lenB > 0) {
        togo = 64U - lenB;
        if (len < togo) {
            togo = len;
        }
        memcpy(ctx->u.b + lenB, dataIn, togo);
        len -= togo;
        dataIn += togo;
        lenB = (lenB + togo) & 63U;
        if (!lenB) {
            shaCompress(&ctx->H[H2X], ctx->u.w);
        }
    }

    while (len >= 64U) {
        len -= 64U;
        shaCompress(&ctx->H[H2X], (PRUint32 *)dataIn);
        dataIn += 64U;
    }

    if (len) {
        memcpy(ctx->u.b, dataIn, len);
    }
}

/*
 * SHA: Compression function, unrolled.
 */
static void
shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
{
#define XH(n) X[n - H2X]

    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);

    uint32x4_t abcd = vld1q_u32(&XH(0));
    PRUint32 e = XH(4);

    const uint32x4_t origABCD = abcd;
    const PRUint32 origE = e;

    uint32x4_t w0 = vld1q_u32(inbuf);
    uint32x4_t w1 = vld1q_u32(inbuf + 4);
    uint32x4_t w2 = vld1q_u32(inbuf + 8);
    uint32x4_t w3 = vld1q_u32(inbuf + 12);

    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));

    uint32x4_t t0 = vaddq_u32(w0, K0);
    uint32x4_t t1 = vaddq_u32(w1, K0);

    PRUint32 tmpE;

    /*
     * Using the following ARM instructions to accelerate SHA1
     *
     * sha1c for round 0 - 20
     * sha1p for round 20 - 40
     * sha1m for round 40 - 60
     * sha1p for round 60 - 80
     * sha1su0 and sha1su1 for message schedule
     * sha1h for rotate left 30
     */

    /* Round 0-3 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K0);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 4-7 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K0);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 8-11 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K0);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 12-15 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 16-19 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K1);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 20-23 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K1);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 24-27 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K1);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 28-31 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 32-35 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 36-39 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K2);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 40-43 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K2);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 44-47 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K2);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 48-51 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 52-55 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 56-59 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K3);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 60-63 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K3);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 64-67 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K3);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 68-71 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);

    /* Round 72-75 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);

    /* Round 76-79 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);

    e += origE;
    abcd = vaddq_u32(origABCD, abcd);

    vst1q_u32(&XH(0), abcd);
    XH(4) = e;
}

#endif /* USE_HW_SHA1 */
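
/*
 * Illustrative usage sketch (not part of the library, excluded from the
 * build). Applications do not call SHA1_Update_Native/SHA1_Compress_Native
 * directly; the generic SHA-1 layer in sha_fast.c is expected to dispatch
 * to them when ARMv8 crypto extensions are available. The snippet below
 * assumes the public freebl entry points declared in blapi.h
 * (SHA1_NewContext, SHA1_Begin, SHA1_Update, SHA1_End, SHA1_DestroyContext)
 * and the SHA1_LENGTH constant.
 */
#if 0
#include <stdio.h>

static void
example_sha1_digest(const unsigned char *data, unsigned int len)
{
    unsigned char digest[SHA1_LENGTH];
    unsigned int digestLen = 0;
    unsigned int i;
    SHA1Context *cx = SHA1_NewContext();

    if (!cx) {
        return;
    }
    SHA1_Begin(cx);
    /* SHA1_Update buffers partial input and feeds complete 64-byte blocks
     * to the compression function (SHA1_Update_Native on this path). */
    SHA1_Update(cx, data, len);
    SHA1_End(cx, digest, &digestLen, sizeof(digest));
    SHA1_DestroyContext(cx, PR_TRUE);

    for (i = 0; i < digestLen; i++) {
        printf("%02x", digest[i]);
    }
    printf("\n");
}
#endif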