1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
|
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 21 Feb 2014 17:24:04 +0100
Subject: [PATCH 221/354] crypto: Reduce preempt disabled regions, more algos
Origin: https://git.kernel.org/cgit/linux/kernel/git/rt/linux-stable-rt.git/commit?id=0dc4bb0d8f3f27f483b63cc5b0f236d42c1477fd
Don Estabrook reported
| kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100()
| kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2462 migrate_enable+0x17b/0x200()
| kernel: WARNING: CPU: 3 PID: 865 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100()
and his backtrace showed some crypto functions which looked fine.
The problem is the following sequence:
glue_xts_crypt_128bit()
{
blkcipher_walk_virt(); /* normal migrate_disable() */
glue_fpu_begin(); /* get atomic */
while (nbytes) {
__glue_xts_crypt_128bit();
blkcipher_walk_done(); /* with nbytes = 0, migrate_enable()
* while we are atomic */
};
glue_fpu_end() /* no longer atomic */
}
and this is why the counter get out of sync and the warning is printed.
The other problem is that we are non-preemptible between
glue_fpu_begin() and glue_fpu_end() and the latency grows. To fix this,
I shorten the FPU off region and ensure blkcipher_walk_done() is called
with preemption enabled. This might hurt the performance because we now
enable/disable the FPU state more often but we gain lower latency and
the bug is gone.
Reported-by: Don Estabrook <don.estabrook@gmail.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/crypto/cast5_avx_glue.c | 21 +++++++++------------
arch/x86/crypto/glue_helper.c | 31 ++++++++++++++++---------------
2 files changed, 25 insertions(+), 27 deletions(-)
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 41034745d6a2..d4bf7fc02ee7 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -61,7 +61,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
static int ecb_crypt(struct skcipher_request *req, bool enc)
{
- bool fpu_enabled = false;
+ bool fpu_enabled;
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
@@ -76,7 +76,7 @@ static int ecb_crypt(struct skcipher_request *req, bool enc)
u8 *wsrc = walk.src.virt.addr;
u8 *wdst = walk.dst.virt.addr;
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes);
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
@@ -105,10 +105,9 @@ static int ecb_crypt(struct skcipher_request *req, bool enc)
} while (nbytes >= bsize);
done:
+ cast5_fpu_end(fpu_enabled);
err = skcipher_walk_done(&walk, nbytes);
}
-
- cast5_fpu_end(fpu_enabled);
return err;
}
@@ -212,7 +211,7 @@ static int cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- bool fpu_enabled = false;
+ bool fpu_enabled;
struct skcipher_walk walk;
unsigned int nbytes;
int err;
@@ -220,12 +219,11 @@ static int cbc_decrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes)) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes);
nbytes = __cbc_decrypt(ctx, &walk);
+ cast5_fpu_end(fpu_enabled);
err = skcipher_walk_done(&walk, nbytes);
}
-
- cast5_fpu_end(fpu_enabled);
return err;
}
@@ -292,7 +290,7 @@ static int ctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- bool fpu_enabled = false;
+ bool fpu_enabled;
struct skcipher_walk walk;
unsigned int nbytes;
int err;
@@ -300,13 +298,12 @@ static int ctr_crypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes);
nbytes = __ctr_crypt(&walk, ctx);
+ cast5_fpu_end(fpu_enabled);
err = skcipher_walk_done(&walk, nbytes);
}
- cast5_fpu_end(fpu_enabled);
-
if (walk.nbytes) {
ctr_crypt_final(&walk, ctx);
err = skcipher_walk_done(&walk, 0);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index a78ef99a9981..dac489a1c4da 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -38,7 +38,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
const unsigned int bsize = 128 / 8;
struct skcipher_walk walk;
- bool fpu_enabled = false;
+ bool fpu_enabled;
unsigned int nbytes;
int err;
@@ -51,7 +51,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
unsigned int i;
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
+ &walk, false, nbytes);
for (i = 0; i < gctx->num_funcs; i++) {
func_bytes = bsize * gctx->funcs[i].num_blocks;
@@ -69,10 +69,9 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
if (nbytes < bsize)
break;
}
+ glue_fpu_end(fpu_enabled);
err = skcipher_walk_done(&walk, nbytes);
}
-
- glue_fpu_end(fpu_enabled);
return err;
}
EXPORT_SYMBOL_GPL(glue_ecb_req_128bit);
@@ -115,7 +114,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
const unsigned int bsize = 128 / 8;
struct skcipher_walk walk;
- bool fpu_enabled = false;
+ bool fpu_enabled;
unsigned int nbytes;
int err;
@@ -129,7 +128,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
u128 last_iv;
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
+ &walk, false, nbytes);
/* Start of the last block. */
src += nbytes / bsize - 1;
dst += nbytes / bsize - 1;
@@ -161,10 +160,10 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
done:
u128_xor(dst, dst, (u128 *)walk.iv);
*(u128 *)walk.iv = last_iv;
+ glue_fpu_end(fpu_enabled);
err = skcipher_walk_done(&walk, nbytes);
}
- glue_fpu_end(fpu_enabled);
return err;
}
EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit);
@@ -175,7 +174,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
const unsigned int bsize = 128 / 8;
struct skcipher_walk walk;
- bool fpu_enabled = false;
+ bool fpu_enabled;
unsigned int nbytes;
int err;
@@ -189,7 +188,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
le128 ctrblk;
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
+ &walk, false, nbytes);
be128_to_le128(&ctrblk, (be128 *)walk.iv);
@@ -213,11 +212,10 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
}
le128_to_be128((be128 *)walk.iv, &ctrblk);
+ glue_fpu_end(fpu_enabled);
err = skcipher_walk_done(&walk, nbytes);
}
- glue_fpu_end(fpu_enabled);
-
if (nbytes) {
le128 ctrblk;
u128 tmp;
@@ -278,7 +276,7 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
{
const unsigned int bsize = 128 / 8;
struct skcipher_walk walk;
- bool fpu_enabled = false;
+ bool fpu_enabled;
unsigned int nbytes;
int err;
@@ -289,21 +287,24 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
/* set minimum length to bsize, for tweak_fn */
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled,
+ &walk, false,
nbytes < bsize ? bsize : nbytes);
/* calculate first value of T */
tweak_fn(tweak_ctx, walk.iv, walk.iv);
while (nbytes) {
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled,
+ nbytes < bsize ? bsize : nbytes);
nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk);
+ glue_fpu_end(fpu_enabled);
+ fpu_enabled = false;
err = skcipher_walk_done(&walk, nbytes);
nbytes = walk.nbytes;
}
- glue_fpu_end(fpu_enabled);
-
return err;
}
EXPORT_SYMBOL_GPL(glue_xts_req_128bit);
|