1 files changed, 312 insertions, 0 deletions
diff --git a/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S b/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
new file mode 100644
index 0000000000..146fc9cfb2
--- /dev/null
+++ b/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
@@ -0,0 +1,312 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
+// - RISC-V Vector AES block cipher extension ('Zvkned')
+// - RISC-V Vector Bit-manipulation extension ('Zvbb')
+// - RISC-V Vector GCM/GMAC extension ('Zvkg')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkned, +zvbb, +zvkg
+
+#include "aes-macros.S"
+
+#define KEYP		a0
+#define INP		a1
+#define OUTP		a2
+#define LEN		a3
+#define TWEAKP		a4
+
+#define LEN32		a5
+#define TAIL_LEN	a6
+#define VL		a7
+#define VLMAX		t4
+
+// v1-v15 contain the AES round keys, but they are used for temporaries before
+// the AES round keys have been loaded.
+#define TWEAKS		v16	// LMUL=4 (most of the time)
+#define TWEAKS_BREV	v20	// LMUL=4 (most of the time)
+#define MULTS_BREV	v24	// LMUL=4 (most of the time)
+#define TMP0		v28
+#define TMP1		v29
+#define TMP2		v30
+#define TMP3		v31
+
+// xts_init initializes the following values:
+//
+//	TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
+//	TWEAKS_BREV: same as TWEAKS, but bit-reversed
+//	MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
+//
+// N is the maximum number of blocks that will be processed per loop iteration,
+// computed using vsetvli.
+//
+// The field convention used by XTS is the same as that of GHASH, but with the
+// bits reversed within each byte.  The zvkg extension provides the vgmul
+// instruction which does multiplication in this field.  Therefore, for tweak
+// computation we use vgmul to do multiplications in parallel, instead of
+// serially multiplying by x using shifting+xoring.  Note that for this to work,
+// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
+.macro	xts_init
+
+	// Load the first tweak T.
+	vsetivli	zero, 4, e32, m1, ta, ma
+	vle32.v		TWEAKS, (TWEAKP)
+
+	// If there's only one block (or no blocks at all), then skip the tweak
+	// sequence computation because (at most) T itself is needed.
+	li		t0, 16
+	ble		LEN, t0, .Linit_single_block\@
+
+	// Save a copy of T bit-reversed in v12.
+	vbrev8.v	v12, TWEAKS
+
+	//
+	// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
+	// that N <= 128.  Though, this code actually requires N < 64 (or
+	// equivalently VLEN < 2048) due to the use of 64-bit intermediate
+	// values here and in the x^N computation later.
+	//
+	vsetvli		VL, LEN32, e32, m4, ta, ma
+	srli		t0, VL, 2	// t0 = N (num blocks)
+	// Generate two sequences, each with N 32-bit values:
+	// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
+	vsetvli		zero, t0, e32, m1, ta, ma
+	vmv.v.i		v0, 1
+	vid.v		v1
+	// Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
+	// as two sequences, each with 2*N 32-bit values:
+	// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
+	vsetvli		zero, t0, e64, m2, ta, ma
+	vzext.vf2	v2, v0
+	vzext.vf2	v4, v1
+	slli		t1, t0, 1	// t1 = 2*N
+	vsetvli		zero, t1, e32, m2, ta, ma
+	// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
+	// widening to 64 bits per element.  When reinterpreted as N 128-bit
+	// values, this is the needed sequence of 128-bit values 1 << i (x^i).
+	vwsll.vv	v8, v2, v4
+
+	// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
+	// multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
+	vsetvli		zero, LEN32, e32, m4, ta, ma
+	vmv.v.i		TWEAKS_BREV, 0
+	vaesz.vs	TWEAKS_BREV, v12
+	vbrev8.v	v8, v8
+	vgmul.vv	TWEAKS_BREV, v8
+
+	// Save a copy of the sequence T*(x^i) with the bit reversal undone.
+	vbrev8.v	TWEAKS, TWEAKS_BREV
+
+	// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
+	li		t1, 1
+	sll		t1, t1, t0	// t1 = 1 << N
+	vsetivli	zero, 2, e64, m1, ta, ma
+	vmv.v.i		v0, 0
+	vsetivli	zero, 1, e64, m1, tu, ma
+	vmv.v.x		v0, t1
+	vbrev8.v	v0, v0
+	vsetvli		zero, LEN32, e32, m4, ta, ma
+	vmv.v.i		MULTS_BREV, 0
+	vaesz.vs	MULTS_BREV, v0
+
+	j		.Linit_done\@
+
+.Linit_single_block\@:
+	vbrev8.v	TWEAKS_BREV, TWEAKS
+.Linit_done\@:
+.endm
+
+// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
+// the multiplier required to advance the tweak by one.
+.macro	load_x
+	li		t0, 0x40
+	vsetivli	zero, 4, e32, m1, ta, ma
+	vmv.v.i		MULTS_BREV, 0
+	vsetivli	zero, 1, e8, m1, tu, ma
+	vmv.v.x		MULTS_BREV, t0
+.endm
+
+.macro	__aes_xts_crypt	enc, keylen
+	// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
+	beqz		LEN32, .Lcts_without_main_loop\@
+
+	vsetvli		VLMAX, zero, e32, m4, ta, ma
+1:
+	vsetvli		VL, LEN32, e32, m4, ta, ma
+2:
+	// Encrypt or decrypt VL/4 blocks.
+	vle32.v		TMP0, (INP)
+	vxor.vv		TMP0, TMP0, TWEAKS
+	aes_crypt	TMP0, \enc, \keylen
+	vxor.vv		TMP0, TMP0, TWEAKS
+	vse32.v		TMP0, (OUTP)
+
+	// Update the pointers and the remaining length.
+	slli		t0, VL, 2
+	add		INP, INP, t0
+	add		OUTP, OUTP, t0
+	sub		LEN32, LEN32, VL
+
+	// Check whether more blocks remain.
+	beqz		LEN32, .Lmain_loop_done\@
+
+	// Compute the next sequence of tweaks by multiplying the previous
+	// sequence by x^N.  Store the result in both bit-reversed order and
+	// regular order (i.e. with the bit reversal undone).
+	vgmul.vv	TWEAKS_BREV, MULTS_BREV
+	vbrev8.v	TWEAKS, TWEAKS_BREV
+
+	// Since we compute the tweak multipliers x^N in advance, we require
+	// that each iteration process the same length except possibly the last.
+	// This conflicts slightly with the behavior allowed by RISC-V Vector
+	// Extension, where CPUs can select a lower length for both of the last
+	// two iterations.  E.g., vl might take the sequence of values
+	// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
+	// can use x^4 again instead of computing x^3.  Therefore, we explicitly
+	// keep the vl at VLMAX if there is at least VLMAX remaining.
+	bge		LEN32, VLMAX, 2b
+	j		1b
+
+.Lmain_loop_done\@:
+	load_x
+
+	// Compute the next tweak.
+	addi		t0, VL, -4
+	vsetivli	zero, 4, e32, m4, ta, ma
+	vslidedown.vx	TWEAKS_BREV, TWEAKS_BREV, t0	// Extract last tweak
+	vsetivli	zero, 4, e32, m1, ta, ma
+	vgmul.vv	TWEAKS_BREV, MULTS_BREV		// Advance to next tweak
+
+	bnez		TAIL_LEN, .Lcts\@
+
+	// Update *TWEAKP to contain the next tweak.
+	vbrev8.v	TWEAKS, TWEAKS_BREV
+	vse32.v		TWEAKS, (TWEAKP)
+	ret
+
+.Lcts_without_main_loop\@:
+	load_x
+.Lcts\@:
+	// TWEAKS_BREV now contains the next tweak.  Compute the one after that.
+	vsetivli	zero, 4, e32, m1, ta, ma
+	vmv.v.v		TMP0, TWEAKS_BREV
+	vgmul.vv	TMP0, MULTS_BREV
+	// Undo the bit reversal of the next two tweaks and store them in TMP1
+	// and TMP2, such that TMP1 is the first needed and TMP2 the second.
+.if \enc
+	vbrev8.v	TMP1, TWEAKS_BREV
+	vbrev8.v	TMP2, TMP0
+.else
+	vbrev8.v	TMP1, TMP0
+	vbrev8.v	TMP2, TWEAKS_BREV
+.endif
+
+	// Encrypt/decrypt the last full block.
+	vle32.v		TMP0, (INP)
+	vxor.vv		TMP0, TMP0, TMP1
+	aes_crypt	TMP0, \enc, \keylen
+	vxor.vv		TMP0, TMP0, TMP1
+
+	// Swap the first TAIL_LEN bytes of the above result with the tail.
+	// Note that to support in-place encryption/decryption, the load from
+	// the input tail must happen before the store to the output tail.
+	addi		t0, INP, 16
+	addi		t1, OUTP, 16
+	vmv.v.v		TMP3, TMP0
+	vsetvli		zero, TAIL_LEN, e8, m1, tu, ma
+	vle8.v		TMP0, (t0)
+	vse8.v		TMP3, (t1)
+
+	// Encrypt/decrypt again and store the last full block.
+	vsetivli	zero, 4, e32, m1, ta, ma
+	vxor.vv		TMP0, TMP0, TMP2
+	aes_crypt	TMP0, \enc, \keylen
+	vxor.vv		TMP0, TMP0, TMP2
+	vse32.v		TMP0, (OUTP)
+
+	ret
+.endm
+
+.macro	aes_xts_crypt	enc
+
+	// Check whether the length is a multiple of the AES block size.
+	andi		TAIL_LEN, LEN, 15
+	beqz		TAIL_LEN, 1f
+
+	// The length isn't a multiple of the AES block size, so ciphertext
+	// stealing will be required.  Ciphertext stealing involves special
+	// handling of the partial block and the last full block, so subtract
+	// the length of both from the length to be processed in the main loop.
+	sub		LEN, LEN, TAIL_LEN
+	addi		LEN, LEN, -16
+1:
+	srli		LEN32, LEN, 2
+	// LEN and LEN32 now contain the total length of the blocks that will be
+	// processed in the main loop, in bytes and 32-bit words respectively.
+
+	xts_init
+	aes_begin	KEYP, 128f, 192f
+	__aes_xts_crypt	\enc, 256
+128:
+	__aes_xts_crypt	\enc, 128
+192:
+	__aes_xts_crypt	\enc, 192
+.endm
+
+// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
+//					 const u8 *in, u8 *out, size_t len,
+//					 u8 tweak[16]);
+//
+// |key| is the data key.  |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done.  This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call.  If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
+SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
+	aes_xts_crypt	1
+SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)
+
+// Same prototype and calling convention as the encryption function
+SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
+	aes_xts_crypt	0
+SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)