/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifdef USE_HW_SHA1

#ifndef __ARM_FEATURE_CRYPTO
#error "Compiler option is invalid"
#endif

#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif

#include <arm_neon.h>
#include <memory.h>

#include "blapi.h"
#include "sha_fast.h"

#if !defined(SHA_PUT_W_IN_STACK)
#define H2X 11
#else
#define H2X 0
#endif

static void shaCompress(SHA_HW_t *X, const PRUint32 *datain);

void
SHA1_Compress_Native(SHA1Context *ctx)
{
    shaCompress(&ctx->H[H2X], ctx->u.w);
}

/*
 * SHA: Add data to context.
 */
void
SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn,
                   unsigned int len)
{
    unsigned int lenB;
    unsigned int togo;

    if (!len) {
        return;
    }

    /* accumulate the byte count. */
    lenB = (unsigned int)(ctx->size) & 63U;

    ctx->size += len;

    /*
     * Read the data into W and process blocks as they get full
     */
    if (lenB > 0) {
        togo = 64U - lenB;
        if (len < togo) {
            togo = len;
        }
        memcpy(ctx->u.b + lenB, dataIn, togo);
        len -= togo;
        dataIn += togo;
        lenB = (lenB + togo) & 63U;
        if (!lenB) {
            shaCompress(&ctx->H[H2X], ctx->u.w);
        }
    }

    while (len >= 64U) {
        len -= 64U;
        shaCompress(&ctx->H[H2X], (PRUint32 *)dataIn);
        dataIn += 64U;
    }

    if (len) {
        memcpy(ctx->u.b, dataIn, len);
    }
}

/*
 * SHA: Compression function, unrolled.
 */
static void
shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
{
#define XH(n) X[n - H2X]

    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);

    uint32x4_t abcd = vld1q_u32(&XH(0));
    PRUint32 e = XH(4);

    const uint32x4_t origABCD = abcd;
    const PRUint32 origE = e;

    uint32x4_t w0 = vld1q_u32(inbuf);
    uint32x4_t w1 = vld1q_u32(inbuf + 4);
    uint32x4_t w2 = vld1q_u32(inbuf + 8);
    uint32x4_t w3 = vld1q_u32(inbuf + 12);

    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));

    uint32x4_t t0 = vaddq_u32(w0, K0);
    uint32x4_t t1 = vaddq_u32(w1, K0);

    PRUint32 tmpE;

    /*
     * Using the following ARM instructions to accelerate SHA1
     *
     * sha1c for round 0 - 20
     * sha1p for round 20 - 40
     * sha1m for round 40 - 60
     * sha1p for round 60 - 80
     * sha1su0 and sha1su1 for message schedule
     * sha1h for rotate left 30
     */

    /* Round 0-3 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K0);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 4-7 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K0);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 8-11 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K0);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 12-15 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 16-19 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K1);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 20-23 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K1);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 24-27 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K1);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 28-31 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 32-35 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 36-39 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K2);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 40-43 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K2);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 44-47 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K2);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 48-51 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 52-55 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 56-59 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K3);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 60-63 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K3);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 64-67 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K3);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 68-71 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);

    /* Round 72-75 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);

    /* Round 76-79 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);

    e += origE;
    abcd = vaddq_u32(origABCD, abcd);

    vst1q_u32(&XH(0), abcd);
    XH(4) = e;
}

#endif /* USE_HW_SHA1 */
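
/*
 * Illustrative usage sketch (not part of the library, excluded from the
 * build). Applications do not call SHA1_Update_Native/SHA1_Compress_Native
 * directly; the generic SHA-1 layer in sha_fast.c is expected to dispatch
 * to them when ARMv8 crypto extensions are available. The snippet below
 * assumes the public freebl entry points declared in blapi.h
 * (SHA1_NewContext, SHA1_Begin, SHA1_Update, SHA1_End, SHA1_DestroyContext)
 * and the SHA1_LENGTH constant.
 */
#if 0
#include <stdio.h>

static void
example_sha1_digest(const unsigned char *data, unsigned int len)
{
    unsigned char digest[SHA1_LENGTH];
    unsigned int digestLen = 0;
    unsigned int i;
    SHA1Context *cx = SHA1_NewContext();

    if (!cx) {
        return;
    }
    SHA1_Begin(cx);
    /* SHA1_Update buffers partial input and feeds complete 64-byte blocks
     * to the compression function (SHA1_Update_Native on this path). */
    SHA1_Update(cx, data, len);
    SHA1_End(cx, digest, &digestLen, sizeof(digest));
    SHA1_DestroyContext(cx, PR_TRUE);

    for (i = 0; i < digestLen; i++) {
        printf("%02x", digest[i]);
    }
    printf("\n");
}
#endif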