/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')

#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define TWEAKP a4
#define LEN32 a5
#define TAIL_LEN a6
#define VL a7
#define VLMAX t4

// v1-v15 contain the AES round keys, but they are used as temporaries before
// the AES round keys have been loaded.
#define TWEAKS v16 // LMUL=4 (most of the time)
#define TWEAKS_BREV v20 // LMUL=4 (most of the time)
#define MULTS_BREV v24 // LMUL=4 (most of the time)
#define TMP0 v28
#define TMP1 v29
#define TMP2 v30
#define TMP3 v31

// xts_init initializes the following values:
//
// TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
// TWEAKS_BREV: same as TWEAKS, but bit-reversed
// MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
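// For example, with LMUL=4 and SEW=32, N is VLEN/32, so a VLEN=128 CPU
// processes up to 4 blocks per iteration.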
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte. The zvkg extension provides the vgmul
// instruction which does multiplication in this field. Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring. Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
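//
// For reference, an illustrative plain-C sketch (not code from this file) of
// the serial multiply-by-x that the vgmul approach replaces, assuming the
// usual XTS convention of little-endian byte order and reduction polynomial
// x^128 + x^7 + x^2 + x + 1:
//
//    static void xts_mul_x(u8 t[16])
//    {
//        u8 carry = 0;
//
//        for (int i = 0; i < 16; i++) {
//            u8 next = t[i] >> 7;
//
//            t[i] = (t[i] << 1) | carry;
//            carry = next;
//        }
//        if (carry)
//            t[0] ^= 0x87;    /* fold the carry back in */
//    }
//
// vgmul performs the same field multiplication (by an arbitrary element, not
// just x), but on bit-reversed operands and on N tweaks at a time.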
.macro xts_init
// Load the first tweak T.
vsetivli zero, 4, e32, m1, ta, ma
vle32.v TWEAKS, (TWEAKP)
// If there's only one block (or no blocks at all), then skip the tweak
// sequence computation because (at most) T itself is needed.
li t0, 16
ble LEN, t0, .Linit_single_block\@
// Save a copy of T bit-reversed in v12.
vbrev8.v v12, TWEAKS
//
// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
// that N <= 128. Though, this code actually requires N < 64 (or
// equivalently VLEN < 2048) due to the use of 64-bit intermediate
// values here and in the x^N computation later.
//
vsetvli VL, LEN32, e32, m4, ta, ma
srli t0, VL, 2 // t0 = N (num blocks)
// Generate two sequences, each with N 32-bit values:
// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
vsetvli zero, t0, e32, m1, ta, ma
vmv.v.i v0, 1
vid.v v1
// Use vzext to zero-extend the sequences to 64 bits. Reinterpret them
// as two sequences, each with 2*N 32-bit values:
// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
vsetvli zero, t0, e64, m2, ta, ma
vzext.vf2 v2, v0
vzext.vf2 v4, v1
slli t1, t0, 1 // t1 = 2*N
vsetvli zero, t1, e32, m2, ta, ma
// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
// widening to 64 bits per element. When reinterpreted as N 128-bit
// values, this is the needed sequence of 128-bit values 1 << i (x^i).
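// For example, with N = 4 this sequence is [1, 0, 2, 0, 4, 0, 8, 0] as 64-bit
// words, i.e. the 128-bit values 1, 2, 4, and 8 (x^0..x^3).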
vwsll.vv v8, v2, v4
// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
// multiply by x^i. This gives the sequence T*(x^i), bit-reversed.
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i TWEAKS_BREV, 0
vaesz.vs TWEAKS_BREV, v12
vbrev8.v v8, v8
vgmul.vv TWEAKS_BREV, v8
// Save a copy of the sequence T*(x^i) with the bit reversal undone.
vbrev8.v TWEAKS, TWEAKS_BREV
// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
li t1, 1
sll t1, t1, t0 // t1 = 1 << N
vsetivli zero, 2, e64, m1, ta, ma
vmv.v.i v0, 0
vsetivli zero, 1, e64, m1, tu, ma
vmv.v.x v0, t1
vbrev8.v v0, v0
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i MULTS_BREV, 0
vaesz.vs MULTS_BREV, v0
j .Linit_done\@
.Linit_single_block\@:
vbrev8.v TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm

// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is
// the multiplier required to advance the tweak by one block.
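// (As a 128-bit field element, x is the integer 0x2; reversing the bits
// within its low byte turns 0x02 into 0x40.)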
.macro load_x
li t0, 0x40
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.i MULTS_BREV, 0
vsetivli zero, 1, e8, m1, tu, ma
vmv.v.x MULTS_BREV, t0
.endm

.macro __aes_xts_crypt enc, keylen
// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
beqz LEN32, .Lcts_without_main_loop\@
vsetvli VLMAX, zero, e32, m4, ta, ma
1:
vsetvli VL, LEN32, e32, m4, ta, ma
2:
// Encrypt or decrypt VL/4 blocks.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TWEAKS
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TWEAKS
vse32.v TMP0, (OUTP)
// Update the pointers and the remaining length.
slli t0, VL, 2
add INP, INP, t0
add OUTP, OUTP, t0
sub LEN32, LEN32, VL
// Check whether more blocks remain.
beqz LEN32, .Lmain_loop_done\@
// Compute the next sequence of tweaks by multiplying the previous
// sequence by x^N. Store the result in both bit-reversed order and
// regular order (i.e. with the bit reversal undone).
vgmul.vv TWEAKS_BREV, MULTS_BREV
vbrev8.v TWEAKS, TWEAKS_BREV
// Since we compute the tweak multipliers x^N in advance, we require
// that each iteration process the same length except possibly the last.
// This conflicts slightly with the behavior allowed by RISC-V Vector
// Extension, where CPUs can select a lower length for both of the last
// two iterations. E.g., vl might take the sequence of values
// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
// can use x^4 again instead of computing x^3. Therefore, we explicitly
// keep the vl at VLMAX if there is at least VLMAX remaining.
bge LEN32, VLMAX, 2b
j 1b
.Lmain_loop_done\@:
load_x
// Compute the next tweak.
addi t0, VL, -4
vsetivli zero, 4, e32, m4, ta, ma
vslidedown.vx TWEAKS_BREV, TWEAKS_BREV, t0 // Extract last tweak
vsetivli zero, 4, e32, m1, ta, ma
vgmul.vv TWEAKS_BREV, MULTS_BREV // Advance to next tweak
bnez TAIL_LEN, .Lcts\@
// Update *TWEAKP to contain the next tweak.
vbrev8.v TWEAKS, TWEAKS_BREV
vse32.v TWEAKS, (TWEAKP)
ret
.Lcts_without_main_loop\@:
load_x
.Lcts\@:
// TWEAKS_BREV now contains the next tweak. Compute the one after that.
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.v TMP0, TWEAKS_BREV
vgmul.vv TMP0, MULTS_BREV
// Undo the bit reversal of the next two tweaks and store them in TMP1
// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
vbrev8.v TMP1, TWEAKS_BREV
vbrev8.v TMP2, TMP0
.else
vbrev8.v TMP1, TMP0
vbrev8.v TMP2, TWEAKS_BREV
.endif
// Encrypt/decrypt the last full block.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TMP1
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP1
// Swap the first TAIL_LEN bytes of the above result with the tail.
// Note that to support in-place encryption/decryption, the load from
// the input tail must happen before the store to the output tail.
addi t0, INP, 16
addi t1, OUTP, 16
vmv.v.v TMP3, TMP0
vsetvli zero, TAIL_LEN, e8, m1, tu, ma
vle8.v TMP0, (t0)
vse8.v TMP3, (t1)
// Encrypt/decrypt again and store the last full block.
vsetivli zero, 4, e32, m1, ta, ma
vxor.vv TMP0, TMP0, TMP2
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP2
vse32.v TMP0, (OUTP)
ret
.endm

.macro aes_xts_crypt enc
// Check whether the length is a multiple of the AES block size.
andi TAIL_LEN, LEN, 15
beqz TAIL_LEN, 1f
// The length isn't a multiple of the AES block size, so ciphertext
// stealing will be required. Ciphertext stealing involves special
// handling of the partial block and the last full block, so subtract
// the length of both from the length to be processed in the main loop.
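// For example, with len = 100: TAIL_LEN = 4, the main loop handles 80 bytes
// (5 blocks), and the ciphertext stealing code handles the remaining
// 16 + 4 bytes.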
sub LEN, LEN, TAIL_LEN
addi LEN, LEN, -16
1:
srli LEN32, LEN, 2
// LEN and LEN32 now contain the total length of the blocks that will be
// processed in the main loop, in bytes and 32-bit words respectively.
xts_init
aes_begin KEYP, 128f, 192f
__aes_xts_crypt \enc, 256
128:
__aes_xts_crypt \enc, 128
192:
__aes_xts_crypt \enc, 192
.endm

// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len,
// u8 tweak[16]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
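//
// A minimal usage sketch (illustrative only, not the actual glue code;
// variable names are hypothetical, and aes_expandkey()/aes_encrypt() are the
// generic AES library helpers):
//
//    struct crypto_aes_ctx tweak_key, data_key;
//    u8 tweak[16];
//
//    aes_expandkey(&tweak_key, raw_tweak_key, keylen);
//    aes_expandkey(&data_key, raw_data_key, keylen);
//
//    /* Encrypt the XTS IV with the tweak key to get the first tweak. */
//    aes_encrypt(&tweak_key, tweak, iv);
//
//    /* Process the whole message in one call. */
//    aes_xts_encrypt_zvkned_zvbb_zvkg(&data_key, src, dst, len, tweak);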
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

// Same prototype and calling convention as the encryption function.
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)