/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')

#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define TWEAKP a4
#define LEN32 a5
#define TAIL_LEN a6
#define VL a7
#define VLMAX t4

// v1-v15 contain the AES round keys, but they are used as temporaries before
// the AES round keys have been loaded.
#define TWEAKS v16 // LMUL=4 (most of the time)
#define TWEAKS_BREV v20 // LMUL=4 (most of the time)
#define MULTS_BREV v24 // LMUL=4 (most of the time)
#define TMP0 v28
#define TMP1 v29
#define TMP2 v30
#define TMP3 v31

// xts_init initializes the following values:
//
// TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
// TWEAKS_BREV: same as TWEAKS, but bit-reversed
// MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
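// For example, with LMUL=4 and SEW=32, N is VLEN/32, so a VLEN=128 CPU
// processes up to 4 blocks per iteration.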
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte. The zvkg extension provides the vgmul
// instruction which does multiplication in this field. Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring. Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
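//
// For reference, an illustrative plain-C sketch (not code from this file) of
// the serial multiply-by-x that the vgmul approach replaces, assuming the
// usual XTS convention of little-endian byte order and reduction polynomial
// x^128 + x^7 + x^2 + x + 1:
//
//    static void xts_mul_x(u8 t[16])
//    {
//        u8 carry = 0;
//
//        for (int i = 0; i < 16; i++) {
//            u8 next = t[i] >> 7;
//
//            t[i] = (t[i] << 1) | carry;
//            carry = next;
//        }
//        if (carry)
//            t[0] ^= 0x87;    /* fold the carry back in */
//    }
//
// vgmul performs the same field multiplication (by an arbitrary element, not
// just x), but on bit-reversed operands and on N tweaks at a time.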
.macro xts_init
// Load the first tweak T.
vsetivli zero, 4, e32, m1, ta, ma
vle32.v TWEAKS, (TWEAKP)
// If there's only one block (or no blocks at all), then skip the tweak
// sequence computation because (at most) T itself is needed.
li t0, 16
ble LEN, t0, .Linit_single_block\@
// Save a copy of T bit-reversed in v12.
vbrev8.v v12, TWEAKS
//
// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
// that N <= 128. Though, this code actually requires N < 64 (or
// equivalently VLEN < 2048) due to the use of 64-bit intermediate
// values here and in the x^N computation later.
//
vsetvli VL, LEN32, e32, m4, ta, ma
srli t0, VL, 2 // t0 = N (num blocks)
// Generate two sequences, each with N 32-bit values:
// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
vsetvli zero, t0, e32, m1, ta, ma
vmv.v.i v0, 1
vid.v v1
// Use vzext to zero-extend the sequences to 64 bits. Reinterpret them
// as two sequences, each with 2*N 32-bit values:
// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
vsetvli zero, t0, e64, m2, ta, ma
vzext.vf2 v2, v0
vzext.vf2 v4, v1
slli t1, t0, 1 // t1 = 2*N
vsetvli zero, t1, e32, m2, ta, ma
// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
// widening to 64 bits per element. When reinterpreted as N 128-bit
// values, this is the needed sequence of 128-bit values 1 << i (x^i).
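// For example, with N = 4 this sequence is [1, 0, 2, 0, 4, 0, 8, 0] as 64-bit
// words, i.e. the 128-bit values 1, 2, 4, and 8 (x^0..x^3).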
vwsll.vv v8, v2, v4
// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
// multiply by x^i. This gives the sequence T*(x^i), bit-reversed.
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i TWEAKS_BREV, 0
vaesz.vs TWEAKS_BREV, v12
vbrev8.v v8, v8
vgmul.vv TWEAKS_BREV, v8
// Save a copy of the sequence T*(x^i) with the bit reversal undone.
vbrev8.v TWEAKS, TWEAKS_BREV
// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
li t1, 1
sll t1, t1, t0 // t1 = 1 << N
vsetivli zero, 2, e64, m1, ta, ma
vmv.v.i v0, 0
vsetivli zero, 1, e64, m1, tu, ma
vmv.v.x v0, t1
vbrev8.v v0, v0
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i MULTS_BREV, 0
vaesz.vs MULTS_BREV, v0
j .Linit_done\@
.Linit_single_block\@:
vbrev8.v TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm

// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is
// the multiplier required to advance the tweak by one block.
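// (As a 128-bit field element, x is the integer 0x2; reversing the bits
// within its low byte turns 0x02 into 0x40.)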
.macro load_x
li t0, 0x40
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.i MULTS_BREV, 0
vsetivli zero, 1, e8, m1, tu, ma
vmv.v.x MULTS_BREV, t0
.endm

.macro __aes_xts_crypt enc, keylen
// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
beqz LEN32, .Lcts_without_main_loop\@
vsetvli VLMAX, zero, e32, m4, ta, ma
1:
vsetvli VL, LEN32, e32, m4, ta, ma
2:
// Encrypt or decrypt VL/4 blocks.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TWEAKS
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TWEAKS
vse32.v TMP0, (OUTP)
// Update the pointers and the remaining length.
slli t0, VL, 2
add INP, INP, t0
add OUTP, OUTP, t0
sub LEN32, LEN32, VL
// Check whether more blocks remain.
beqz LEN32, .Lmain_loop_done\@
// Compute the next sequence of tweaks by multiplying the previous
// sequence by x^N. Store the result in both bit-reversed order and
// regular order (i.e. with the bit reversal undone).
vgmul.vv TWEAKS_BREV, MULTS_BREV
vbrev8.v TWEAKS, TWEAKS_BREV
// Since we compute the tweak multipliers x^N in advance, we require
// that each iteration process the same length except possibly the last.
// This conflicts slightly with the behavior allowed by RISC-V Vector
// Extension, where CPUs can select a lower length for both of the last
// two iterations. E.g., vl might take the sequence of values
// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
// can use x^4 again instead of computing x^3. Therefore, we explicitly
// keep the vl at VLMAX if there is at least VLMAX remaining.
bge LEN32, VLMAX, 2b
j 1b
.Lmain_loop_done\@:
load_x
// Compute the next tweak.
addi t0, VL, -4
vsetivli zero, 4, e32, m4, ta, ma
vslidedown.vx TWEAKS_BREV, TWEAKS_BREV, t0 // Extract last tweak
vsetivli zero, 4, e32, m1, ta, ma
vgmul.vv TWEAKS_BREV, MULTS_BREV // Advance to next tweak
bnez TAIL_LEN, .Lcts\@
// Update *TWEAKP to contain the next tweak.
vbrev8.v TWEAKS, TWEAKS_BREV
vse32.v TWEAKS, (TWEAKP)
ret
.Lcts_without_main_loop\@:
load_x
.Lcts\@:
// TWEAKS_BREV now contains the next tweak. Compute the one after that.
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.v TMP0, TWEAKS_BREV
vgmul.vv TMP0, MULTS_BREV
// Undo the bit reversal of the next two tweaks and store them in TMP1
// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
vbrev8.v TMP1, TWEAKS_BREV
vbrev8.v TMP2, TMP0
.else
vbrev8.v TMP1, TMP0
vbrev8.v TMP2, TWEAKS_BREV
.endif
// Encrypt/decrypt the last full block.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TMP1
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP1
// Swap the first TAIL_LEN bytes of the above result with the tail.
// Note that to support in-place encryption/decryption, the load from
// the input tail must happen before the store to the output tail.
addi t0, INP, 16
addi t1, OUTP, 16
vmv.v.v TMP3, TMP0
vsetvli zero, TAIL_LEN, e8, m1, tu, ma
vle8.v TMP0, (t0)
vse8.v TMP3, (t1)
// Encrypt/decrypt again and store the last full block.
vsetivli zero, 4, e32, m1, ta, ma
vxor.vv TMP0, TMP0, TMP2
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP2
vse32.v TMP0, (OUTP)
ret
.endm

.macro aes_xts_crypt enc
// Check whether the length is a multiple of the AES block size.
andi TAIL_LEN, LEN, 15
beqz TAIL_LEN, 1f
// The length isn't a multiple of the AES block size, so ciphertext
// stealing will be required. Ciphertext stealing involves special
// handling of the partial block and the last full block, so subtract
// the length of both from the length to be processed in the main loop.
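// For example, with len = 100: TAIL_LEN = 4, the main loop handles 80 bytes
// (5 blocks), and the ciphertext stealing code handles the remaining
// 16 + 4 bytes.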
sub LEN, LEN, TAIL_LEN
addi LEN, LEN, -16
1:
srli LEN32, LEN, 2
// LEN and LEN32 now contain the total length of the blocks that will be
// processed in the main loop, in bytes and 32-bit words respectively.
xts_init
aes_begin KEYP, 128f, 192f
__aes_xts_crypt \enc, 256
128:
__aes_xts_crypt \enc, 128
192:
__aes_xts_crypt \enc, 192
.endm

// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len,
// u8 tweak[16]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
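//
// A minimal usage sketch (illustrative only, not the actual glue code;
// variable names are hypothetical, and aes_expandkey()/aes_encrypt() are the
// generic AES library helpers):
//
//    struct crypto_aes_ctx tweak_key, data_key;
//    u8 tweak[16];
//
//    aes_expandkey(&tweak_key, raw_tweak_key, keylen);
//    aes_expandkey(&data_key, raw_data_key, keylen);
//
//    /* Encrypt the XTS IV with the tweak key to get the first tweak. */
//    aes_encrypt(&tweak_key, tweak, iv);
//
//    /* Process the whole message in one call. */
//    aes_xts_encrypt_zvkned_zvbb_zvkg(&data_key, src, dst, len, tweak);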
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

// Same prototype and calling convention as the encryption function.
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)