summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--arch/x86/crypto/Makefile3
-rw-r--r--arch/x86/crypto/aes-xts-avx-x86_64.S845
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S467
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c400
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S253
5 files changed, 1270 insertions, 698 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9aa46093c9..9c5ce56137 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,7 +48,8 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \
+ aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
new file mode 100644
index 0000000000..48f97b79f7
--- /dev/null
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -0,0 +1,845 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES-XTS for modern x86_64 CPUs
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+/*
+ * This file implements AES-XTS for modern x86_64 CPUs. To handle the
+ * complexities of coding for x86 SIMD, e.g. where every vector length needs
+ * different code, it uses a macro to generate several implementations that
+ * share similar source code but are targeted at different CPUs, listed below:
+ *
+ * AES-NI + AVX
+ * - 128-bit vectors (1 AES block per vector)
+ * - VEX-coded instructions
+ * - xmm0-xmm15
+ * - This is for older CPUs that lack VAES but do have AVX.
+ *
+ * VAES + VPCLMULQDQ + AVX2
+ * - 256-bit vectors (2 AES blocks per vector)
+ * - VEX-coded instructions
+ * - ymm0-ymm15
+ * - This is for CPUs that have VAES but lack AVX512 or AVX10,
+ * e.g. Intel's Alder Lake and AMD's Zen 3.
+ *
+ * VAES + VPCLMULQDQ + AVX10/256 + BMI2
+ * - 256-bit vectors (2 AES blocks per vector)
+ * - EVEX-coded instructions
+ * - ymm0-ymm31
+ * - This is for CPUs that have AVX512 but where using zmm registers causes
+ * downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
+ * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
+ * To avoid confusion with 512-bit, we just write AVX10/256.
+ *
+ * VAES + VPCLMULQDQ + AVX10/512 + BMI2
+ * - Same as the previous one, but upgrades to 512-bit vectors
+ * (4 AES blocks per vector) in zmm0-zmm31.
+ * - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *
+ * This file doesn't have an implementation for AES-NI alone (without AVX), as
+ * the lack of VEX would make all the assembly code different.
+ *
+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
+ * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be
+ * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might
+ * need to start also providing an implementation using VAES alone.
+ *
+ * The AES-XTS implementations in this file support everything required by the
+ * crypto API, including support for arbitrary input lengths and multi-part
+ * processing. However, they are most heavily optimized for the common case of
+ * power-of-2 length inputs that are processed in a single part (disk sectors).
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+.Lgf_poly:
+ // The low 64 bits of this value represent the polynomial x^7 + x^2 + x
+ // + 1. It is the value that must be XOR'd into the low 64 bits of the
+ // tweak each time a 1 is carried out of the high 64 bits.
+ //
+ // The high 64 bits of this value is just the internal carry bit that
+ // exists when there's a carry out of the low 64 bits of the tweak.
+ .quad 0x87, 1
+
+ // This table contains constants for vpshufb and vpblendvb, used to
+ // handle variable byte shifts and blending during ciphertext stealing
+ // on CPUs that don't support AVX10-style masking.
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+.text
+
+// Function parameters
+.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
+ // advanced to point to 7th-from-last round key
+.set SRC, %rsi // Pointer to next source data
+.set DST, %rdx // Pointer to next destination data
+.set LEN, %ecx // Remaining length in bytes
+.set LEN8, %cl
+.set LEN64, %rcx
+.set TWEAK, %r8 // Pointer to next tweak
+
+// %rax holds the AES key length in bytes.
+.set KEYLEN, %eax
+.set KEYLEN64, %rax
+
+// %r9-r11 are available as temporaries.
+
+.macro _define_Vi i
+.if VL == 16
+ .set V\i, %xmm\i
+.elseif VL == 32
+ .set V\i, %ymm\i
+.elseif VL == 64
+ .set V\i, %zmm\i
+.else
+ .error "Unsupported Vector Length (VL)"
+.endif
+.endm
+
+.macro _define_aliases
+ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
+ // are available, that map to the xmm, ymm, or zmm registers according
+ // to the selected Vector Length (VL).
+ _define_Vi 0
+ _define_Vi 1
+ _define_Vi 2
+ _define_Vi 3
+ _define_Vi 4
+ _define_Vi 5
+ _define_Vi 6
+ _define_Vi 7
+ _define_Vi 8
+ _define_Vi 9
+ _define_Vi 10
+ _define_Vi 11
+ _define_Vi 12
+ _define_Vi 13
+ _define_Vi 14
+ _define_Vi 15
+.if USE_AVX10
+ _define_Vi 16
+ _define_Vi 17
+ _define_Vi 18
+ _define_Vi 19
+ _define_Vi 20
+ _define_Vi 21
+ _define_Vi 22
+ _define_Vi 23
+ _define_Vi 24
+ _define_Vi 25
+ _define_Vi 26
+ _define_Vi 27
+ _define_Vi 28
+ _define_Vi 29
+ _define_Vi 30
+ _define_Vi 31
+.endif
+
+ // V0-V3 hold the data blocks during the main loop, or temporary values
+ // otherwise. V4-V5 hold temporary values.
+
+ // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
+ .set TWEAK0_XMM, %xmm6
+ .set TWEAK0, V6
+ .set TWEAK1_XMM, %xmm7
+ .set TWEAK1, V7
+ .set TWEAK2, V8
+ .set TWEAK3, V9
+
+ // V10-V13 are used for computing the next values of TWEAK[0-3].
+ .set NEXT_TWEAK0, V10
+ .set NEXT_TWEAK1, V11
+ .set NEXT_TWEAK2, V12
+ .set NEXT_TWEAK3, V13
+
+ // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
+ .set GF_POLY_XMM, %xmm14
+ .set GF_POLY, V14
+
+ // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
+ .set KEY0_XMM, %xmm15
+ .set KEY0, V15
+
+ // If 32 SIMD registers are available, then V16-V29 hold the remaining
+ // AES round keys, copied to all 128-bit lanes.
+ //
+ // AES-128, AES-192, and AES-256 use different numbers of round keys.
+ // To allow handling all three variants efficiently, we align the round
+ // keys to the *end* of this register range. I.e., AES-128 uses
+ // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
+ // (All also use KEY0 for the XOR-only "round" at the beginning.)
+.if USE_AVX10
+ .set KEY1_XMM, %xmm16
+ .set KEY1, V16
+ .set KEY2_XMM, %xmm17
+ .set KEY2, V17
+ .set KEY3_XMM, %xmm18
+ .set KEY3, V18
+ .set KEY4_XMM, %xmm19
+ .set KEY4, V19
+ .set KEY5_XMM, %xmm20
+ .set KEY5, V20
+ .set KEY6_XMM, %xmm21
+ .set KEY6, V21
+ .set KEY7_XMM, %xmm22
+ .set KEY7, V22
+ .set KEY8_XMM, %xmm23
+ .set KEY8, V23
+ .set KEY9_XMM, %xmm24
+ .set KEY9, V24
+ .set KEY10_XMM, %xmm25
+ .set KEY10, V25
+ .set KEY11_XMM, %xmm26
+ .set KEY11, V26
+ .set KEY12_XMM, %xmm27
+ .set KEY12, V27
+ .set KEY13_XMM, %xmm28
+ .set KEY13, V28
+ .set KEY14_XMM, %xmm29
+ .set KEY14, V29
+.endif
+ // V30-V31 are currently unused.
+.endm
+
+// Move a vector between memory and a register.
+.macro _vmovdqu src, dst
+.if VL < 64
+ vmovdqu \src, \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value into a vector.
+.macro _vbroadcast128 src, dst
+.if VL == 16 && !USE_AVX10
+ vmovdqu \src, \dst
+.elseif VL == 32 && !USE_AVX10
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro _vpxor src1, src2, dst
+.if USE_AVX10
+ vpxord \src1, \src2, \dst
+.else
+ vpxor \src1, \src2, \dst
+.endif
+.endm
+
+// XOR three vectors together.
+.macro _xor3 src1, src2, src3_and_dst
+.if USE_AVX10
+ // vpternlogd with immediate 0x96 is a three-argument XOR.
+ vpternlogd $0x96, \src1, \src2, \src3_and_dst
+.else
+ vpxor \src1, \src3_and_dst, \src3_and_dst
+ vpxor \src2, \src3_and_dst, \src3_and_dst
+.endif
+.endm
+
+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
+// (by multiplying by the polynomial 'x') and write it to \dst.
+.macro _next_tweak src, tmp, dst
+ vpshufd $0x13, \src, \tmp
+ vpaddq \src, \src, \dst
+ vpsrad $31, \tmp, \tmp
+ vpand GF_POLY_XMM, \tmp, \tmp
+ vpxor \tmp, \dst, \dst
+.endm
+
+// Given the XTS tweak(s) in the vector \src, compute the next vector of
+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
+//
+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
+// all tweaks in the vector in parallel. If VL=16, we just do the regular
+// computation without vpclmulqdq, as it's the faster method for a single tweak.
+.macro _next_tweakvec src, tmp1, tmp2, dst
+.if VL == 16
+ _next_tweak \src, \tmp1, \dst
+.else
+ vpsrlq $64 - VL/16, \src, \tmp1
+ vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2
+ vpslldq $8, \tmp1, \tmp1
+ vpsllq $VL/16, \src, \dst
+ _xor3 \tmp1, \tmp2, \dst
+.endif
+.endm
+
+// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
+// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
+.macro _compute_first_set_of_tweaks
+ vmovdqu (TWEAK), TWEAK0_XMM
+ _vbroadcast128 .Lgf_poly(%rip), GF_POLY
+.if VL == 16
+ // With VL=16, multiplying by x serially is fastest.
+ _next_tweak TWEAK0, %xmm0, TWEAK1
+ _next_tweak TWEAK1, %xmm0, TWEAK2
+ _next_tweak TWEAK2, %xmm0, TWEAK3
+.else
+.if VL == 32
+ // Compute the second block of TWEAK0.
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1
+ vinserti128 $1, %xmm1, TWEAK0, TWEAK0
+.elseif VL == 64
+ // Compute the remaining blocks of TWEAK0.
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1
+ _next_tweak %xmm1, %xmm0, %xmm2
+ _next_tweak %xmm2, %xmm0, %xmm3
+ vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
+ vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
+ vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0
+.endif
+ // Compute TWEAK[1-3] from TWEAK0.
+ vpsrlq $64 - 1*VL/16, TWEAK0, V0
+ vpsrlq $64 - 2*VL/16, TWEAK0, V2
+ vpsrlq $64 - 3*VL/16, TWEAK0, V4
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpclmulqdq $0x01, GF_POLY, V2, V3
+ vpclmulqdq $0x01, GF_POLY, V4, V5
+ vpslldq $8, V0, V0
+ vpslldq $8, V2, V2
+ vpslldq $8, V4, V4
+ vpsllq $1*VL/16, TWEAK0, TWEAK1
+ vpsllq $2*VL/16, TWEAK0, TWEAK2
+ vpsllq $3*VL/16, TWEAK0, TWEAK3
+.if USE_AVX10
+ vpternlogd $0x96, V0, V1, TWEAK1
+ vpternlogd $0x96, V2, V3, TWEAK2
+ vpternlogd $0x96, V4, V5, TWEAK3
+.else
+ vpxor V0, TWEAK1, TWEAK1
+ vpxor V2, TWEAK2, TWEAK2
+ vpxor V4, TWEAK3, TWEAK3
+ vpxor V1, TWEAK1, TWEAK1
+ vpxor V3, TWEAK2, TWEAK2
+ vpxor V5, TWEAK3, TWEAK3
+.endif
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the method of just
+// multiplying by x repeatedly (the same method _next_tweak uses).
+.macro _tweak_step_mulx i
+.if \i == 0
+ .set PREV_TWEAK, TWEAK3
+ .set NEXT_TWEAK, NEXT_TWEAK0
+.elseif \i == 5
+ .set PREV_TWEAK, NEXT_TWEAK0
+ .set NEXT_TWEAK, NEXT_TWEAK1
+.elseif \i == 10
+ .set PREV_TWEAK, NEXT_TWEAK1
+ .set NEXT_TWEAK, NEXT_TWEAK2
+.elseif \i == 15
+ .set PREV_TWEAK, NEXT_TWEAK2
+ .set NEXT_TWEAK, NEXT_TWEAK3
+.endif
+.if \i >= 0 && \i < 20 && \i % 5 == 0
+ vpshufd $0x13, PREV_TWEAK, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 1
+ vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
+.elseif \i >= 0 && \i < 20 && \i % 5 == 2
+ vpsrad $31, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 3
+ vpand GF_POLY, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 4
+ vpxor V5, NEXT_TWEAK, NEXT_TWEAK
+.elseif \i == 1000
+ vmovdqa NEXT_TWEAK0, TWEAK0
+ vmovdqa NEXT_TWEAK1, TWEAK1
+ vmovdqa NEXT_TWEAK2, TWEAK2
+ vmovdqa NEXT_TWEAK3, TWEAK3
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
+// (the same method _next_tweakvec uses for VL > 16). This means multiplying
+// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8
+// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
+// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+.macro _tweak_step_pclmul i
+.if \i == 0
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
+.elseif \i == 2
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
+.elseif \i == 4
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
+.elseif \i == 6
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
+.elseif \i == 8
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
+.elseif \i == 10
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
+.elseif \i == 12
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
+.elseif \i == 14
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
+.elseif \i == 1000
+ vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
+ vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1
+ vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2
+ vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3
+ _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0
+ _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1
+ _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2
+ _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3
+.endif
+.endm
+
+// _tweak_step does one step of the computation of the next set of tweaks from
+// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
+// \i that include at least 0 through 19, then 1000 which signals the last step.
+//
+// This is used to interleave the computation of the next set of tweaks with the
+// AES en/decryptions, which increases performance in some cases.
+.macro _tweak_step i
+.if VL == 16
+ _tweak_step_mulx \i
+.else
+ _tweak_step_pclmul \i
+.endif
+.endm
+
+.macro _setup_round_keys enc
+
+ // Select either the encryption round keys or the decryption round keys.
+.if \enc
+ .set OFFS, 0
+.else
+ .set OFFS, 240
+.endif
+
+ // Load the round key for "round 0".
+ _vbroadcast128 OFFS(KEY), KEY0
+
+ // Increment KEY to make it so that 7*16(KEY) is the last round key.
+ // For AES-128, increment by 3*16, resulting in the 10 round keys (not
+ // counting the zero-th round key which was just loaded into KEY0) being
+ // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
+ // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
+ // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
+ //
+ // This rebasing provides two benefits. First, it makes the offset to
+ // any round key be in the range [-96, 112], fitting in a signed byte.
+ // This shortens VEX-encoded instructions that access the later round
+ // keys which otherwise would need 4-byte offsets. Second, it makes it
+ // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
+ // beginning. Skipping rounds at the end doesn't work as well because
+ // the last round needs different instructions.
+ //
+ // An alternative approach would be to roll up all the round loops. We
+ // don't do that because it isn't compatible with caching the round keys
+ // in registers which we do when possible (see below), and also because
+ // it seems unwise to rely *too* heavily on the CPU's branch predictor.
+ lea OFFS-16(KEY, KEYLEN64, 4), KEY
+
+ // If all 32 SIMD registers are available, cache all the round keys.
+.if USE_AVX10
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ _vbroadcast128 -6*16(KEY), KEY1
+ _vbroadcast128 -5*16(KEY), KEY2
+.Laes192\@:
+ _vbroadcast128 -4*16(KEY), KEY3
+ _vbroadcast128 -3*16(KEY), KEY4
+.Laes128\@:
+ _vbroadcast128 -2*16(KEY), KEY5
+ _vbroadcast128 -1*16(KEY), KEY6
+ _vbroadcast128 0*16(KEY), KEY7
+ _vbroadcast128 1*16(KEY), KEY8
+ _vbroadcast128 2*16(KEY), KEY9
+ _vbroadcast128 3*16(KEY), KEY10
+ _vbroadcast128 4*16(KEY), KEY11
+ _vbroadcast128 5*16(KEY), KEY12
+ _vbroadcast128 6*16(KEY), KEY13
+ _vbroadcast128 7*16(KEY), KEY14
+.endif
+.endm
+
+// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
+// on the block(s) in \data using the round key(s) in \key. The register length
+// determines the number of AES blocks en/decrypted.
+.macro _vaes enc, last, key, data
+.if \enc
+.if \last
+ vaesenclast \key, \data, \data
+.else
+ vaesenc \key, \data, \data
+.endif
+.else
+.if \last
+ vaesdeclast \key, \data, \data
+.else
+ vaesdec \key, \data, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the block(s) in \data, using the
+// same key for all block(s). The round key is loaded from the appropriate
+// register or memory location for round \i. May clobber V4.
+.macro _vaes_1x enc, last, i, xmm_suffix, data
+.if USE_AVX10
+ _vaes \enc, \last, KEY\i\xmm_suffix, \data
+.else
+.ifnb \xmm_suffix
+ _vaes \enc, \last, (\i-7)*16(KEY), \data
+.else
+ _vbroadcast128 (\i-7)*16(KEY), V4
+ _vaes \enc, \last, V4, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the blocks in registers V0-V3,
+// using the same key for all blocks. The round key is loaded from the
+// appropriate register or memory location for round \i. In addition, does two
+// steps of the computation of the next set of tweaks. May clobber V4.
+.macro _vaes_4x enc, last, i
+.if USE_AVX10
+ _tweak_step (2*(\i-5))
+ _vaes \enc, \last, KEY\i, V0
+ _vaes \enc, \last, KEY\i, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, \last, KEY\i, V2
+ _vaes \enc, \last, KEY\i, V3
+.else
+ _vbroadcast128 (\i-7)*16(KEY), V4
+ _tweak_step (2*(\i-5))
+ _vaes \enc, \last, V4, V0
+ _vaes \enc, \last, V4, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, \last, V4, V2
+ _vaes \enc, \last, V4, V3
+.endif
+.endm
+
+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
+// then XOR with \tweak again) of the block(s) in \data. To process a single
+// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
+// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
+.macro _aes_crypt enc, xmm_suffix, tweak, data
+ _xor3 KEY0\xmm_suffix, \tweak, \data
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ _vaes_1x \enc, 0, 1, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 2, \xmm_suffix, \data
+.Laes192\@:
+ _vaes_1x \enc, 0, 3, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 4, \xmm_suffix, \data
+.Laes128\@:
+ _vaes_1x \enc, 0, 5, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 6, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 7, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 8, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 9, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 10, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 11, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 12, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 13, \xmm_suffix, \data
+ _vaes_1x \enc, 1, 14, \xmm_suffix, \data
+ _vpxor \tweak, \data, \data
+.endm
+
+.macro _aes_xts_crypt enc
+ _define_aliases
+
+.if !\enc
+ // When decrypting a message whose length isn't a multiple of the AES
+ // block length, exclude the last full block from the main loop by
+ // subtracting 16 from LEN. This is needed because ciphertext stealing
+ // decryption uses the last two tweaks in reverse order. We'll handle
+ // the last full block and the partial block specially at the end.
+ lea -16(LEN), %eax
+ test $15, LEN8
+ cmovnz %eax, LEN
+.endif
+
+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+ movl 480(KEY), KEYLEN
+
+ // Setup the pointer to the round keys and cache as many as possible.
+ _setup_round_keys \enc
+
+ // Compute the first set of tweaks TWEAK[0-3].
+ _compute_first_set_of_tweaks
+
+ sub $4*VL, LEN
+ jl .Lhandle_remainder\@
+
+.Lmain_loop\@:
+ // This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+ // XOR each source block with its tweak and the zero-th round key.
+.if USE_AVX10
+ vmovdqu8 0*VL(SRC), V0
+ vmovdqu8 1*VL(SRC), V1
+ vmovdqu8 2*VL(SRC), V2
+ vmovdqu8 3*VL(SRC), V3
+ vpternlogd $0x96, TWEAK0, KEY0, V0
+ vpternlogd $0x96, TWEAK1, KEY0, V1
+ vpternlogd $0x96, TWEAK2, KEY0, V2
+ vpternlogd $0x96, TWEAK3, KEY0, V3
+.else
+ vpxor 0*VL(SRC), KEY0, V0
+ vpxor 1*VL(SRC), KEY0, V1
+ vpxor 2*VL(SRC), KEY0, V2
+ vpxor 3*VL(SRC), KEY0, V3
+ vpxor TWEAK0, V0, V0
+ vpxor TWEAK1, V1, V1
+ vpxor TWEAK2, V2, V2
+ vpxor TWEAK3, V3, V3
+.endif
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ // Do all the AES rounds on the data blocks, interleaved with
+ // the computation of the next set of tweaks.
+ _vaes_4x \enc, 0, 1
+ _vaes_4x \enc, 0, 2
+.Laes192\@:
+ _vaes_4x \enc, 0, 3
+ _vaes_4x \enc, 0, 4
+.Laes128\@:
+ _vaes_4x \enc, 0, 5
+ _vaes_4x \enc, 0, 6
+ _vaes_4x \enc, 0, 7
+ _vaes_4x \enc, 0, 8
+ _vaes_4x \enc, 0, 9
+ _vaes_4x \enc, 0, 10
+ _vaes_4x \enc, 0, 11
+ _vaes_4x \enc, 0, 12
+ _vaes_4x \enc, 0, 13
+ _vaes_4x \enc, 1, 14
+
+ // XOR in the tweaks again.
+ _vpxor TWEAK0, V0, V0
+ _vpxor TWEAK1, V1, V1
+ _vpxor TWEAK2, V2, V2
+ _vpxor TWEAK3, V3, V3
+
+ // Store the destination blocks.
+ _vmovdqu V0, 0*VL(DST)
+ _vmovdqu V1, 1*VL(DST)
+ _vmovdqu V2, 2*VL(DST)
+ _vmovdqu V3, 3*VL(DST)
+
+ // Finish computing the next set of tweaks.
+ _tweak_step 1000
+
+ add $4*VL, SRC
+ add $4*VL, DST
+ sub $4*VL, LEN
+ jge .Lmain_loop\@
+
+ // Check for the uncommon case where the data length isn't a multiple of
+ // 4*VL. Handle it out-of-line in order to optimize for the common
+ // case. In the common case, just fall through to the ret.
+ test $4*VL-1, LEN8
+ jnz .Lhandle_remainder\@
+.Ldone\@:
+ // Store the next tweak back to *TWEAK to support continuation calls.
+ vmovdqu TWEAK0_XMM, (TWEAK)
+.if VL > 16
+ vzeroupper
+.endif
+ RET
+
+.Lhandle_remainder\@:
+
+ // En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+ add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
+ jl .Lvec_at_a_time_done\@
+.Lvec_at_a_time\@:
+ _vmovdqu (SRC), V0
+ _aes_crypt \enc, , TWEAK0, V0
+ _vmovdqu V0, (DST)
+ _next_tweakvec TWEAK0, V0, V1, TWEAK0
+ add $VL, SRC
+ add $VL, DST
+ sub $VL, LEN
+ jge .Lvec_at_a_time\@
+.Lvec_at_a_time_done\@:
+ add $VL-16, LEN // Undo extra sub of VL, then sub 16.
+.else
+ add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
+.endif
+
+ // En/decrypt any remaining full blocks, one at a time.
+ jl .Lblock_at_a_time_done\@
+.Lblock_at_a_time\@:
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ vmovdqu %xmm0, (DST)
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM
+ add $16, SRC
+ add $16, DST
+ sub $16, LEN
+ jge .Lblock_at_a_time\@
+.Lblock_at_a_time_done\@:
+ add $16, LEN // Undo the extra sub of 16.
+ // Now 0 <= LEN <= 15. If LEN is zero, we're done.
+ jz .Ldone\@
+
+ // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
+ // Do ciphertext stealing to process the last 16 + LEN bytes.
+
+.if \enc
+ // If encrypting, the main loop already encrypted the last full block to
+ // create the CTS intermediate ciphertext. Prepare for the rest of CTS
+ // by rewinding the pointers and loading the intermediate ciphertext.
+ sub $16, SRC
+ sub $16, DST
+ vmovdqu (DST), %xmm0
+.else
+ // If decrypting, the main loop didn't decrypt the last full block
+ // because CTS decryption uses the last two tweaks in reverse order.
+ // Do it now by advancing the tweak and decrypting the last full block.
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0
+.endif
+
+.if USE_AVX10
+ // Create a mask that has the first LEN bits set.
+ mov $-1, %r9d
+ bzhi LEN, %r9d, %r9d
+ kmovd %r9d, %k1
+
+ // Swap the first LEN bytes of the en/decryption of the last full block
+ // with the partial block. Note that to support in-place en/decryption,
+ // the load from the src partial block must happen before the store to
+ // the dst partial block.
+ vmovdqa %xmm0, %xmm1
+ vmovdqu8 16(SRC), %xmm0{%k1}
+ vmovdqu8 %xmm1, 16(DST){%k1}
+.else
+ lea .Lcts_permute_table(%rip), %r9
+
+ // Load the src partial block, left-aligned. Note that to support
+ // in-place en/decryption, this must happen before the store to the dst
+ // partial block.
+ vmovdqu (SRC, LEN64, 1), %xmm1
+
+ // Shift the first LEN bytes of the en/decryption of the last full block
+ // to the end of a register, then store it to DST+LEN. This stores the
+ // dst partial block. It also writes to the second part of the dst last
+ // full block, but that part is overwritten later.
+ vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
+ vmovdqu %xmm2, (DST, LEN64, 1)
+
+ // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
+ sub LEN64, %r9
+ vmovdqu 32(%r9), %xmm3
+
+ // Shift the src partial block to the beginning of its register.
+ vpshufb %xmm3, %xmm1, %xmm1
+
+ // Do a blend to generate the src partial block followed by the second
+ // part of the en/decryption of the last full block.
+ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
+.endif
+ // En/decrypt again and store the last full block.
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ vmovdqu %xmm0, (DST)
+ jmp .Ldone\@
+.endm
+
+// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+// u8 iv[AES_BLOCK_SIZE]);
+SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
+ vmovdqu (%rsi), %xmm0
+ vpxor (%rdi), %xmm0, %xmm0
+ movl 480(%rdi), %eax // AES key length
+ lea -16(%rdi, %rax, 4), %rdi
+ cmp $24, %eax
+ jl .Lencrypt_iv_aes128
+ je .Lencrypt_iv_aes192
+ vaesenc -6*16(%rdi), %xmm0, %xmm0
+ vaesenc -5*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes192:
+ vaesenc -4*16(%rdi), %xmm0, %xmm0
+ vaesenc -3*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes128:
+ vaesenc -2*16(%rdi), %xmm0, %xmm0
+ vaesenc -1*16(%rdi), %xmm0, %xmm0
+ vaesenc 0*16(%rdi), %xmm0, %xmm0
+ vaesenc 1*16(%rdi), %xmm0, %xmm0
+ vaesenc 2*16(%rdi), %xmm0, %xmm0
+ vaesenc 3*16(%rdi), %xmm0, %xmm0
+ vaesenc 4*16(%rdi), %xmm0, %xmm0
+ vaesenc 5*16(%rdi), %xmm0, %xmm0
+ vaesenc 6*16(%rdi), %xmm0, %xmm0
+ vaesenclast 7*16(%rdi), %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsi)
+ RET
+SYM_FUNC_END(aes_xts_encrypt_iv)
+
+// Below are the actual AES-XTS encryption and decryption functions,
+// instantiated from the above macro. They all have the following prototype:
+//
+// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
+// const u8 *src, u8 *dst, unsigned int len,
+// u8 tweak[AES_BLOCK_SIZE]);
+//
+// |key| is the data key. |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done. This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call. If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
+
+.set VL, 16
+.set USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+.set VL, 32
+.set USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
+
+.set VL, 32
+.set USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
+
+.set VL, 64
+.set USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 411d8c83e8..39066b57a7 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -83,9 +83,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.text
-
-#define STACK_OFFSET 8*3
-
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
@@ -116,11 +113,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define arg4 rcx
#define arg5 r8
#define arg6 r9
-#define arg7 STACK_OFFSET+8(%rsp)
-#define arg8 STACK_OFFSET+16(%rsp)
-#define arg9 STACK_OFFSET+24(%rsp)
-#define arg10 STACK_OFFSET+32(%rsp)
-#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif
@@ -1507,184 +1499,6 @@ _esb_loop_\@:
MOVADQ (%r10),\TMP1
aesenclast \TMP1,\XMM0
.endm
-/*****************************************************************************
-* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Plaintext output. Encrypt in-place is allowed.
-* const u8 *in, // Ciphertext input
-* u64 plaintext_len, // Length of data in bytes for decryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
-* // given authentication tag and only return the plaintext if they match.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
-* // (most likely), 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the first
-* set of 11 keys in the data structure void *aes_ctx
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-*
-*****************************************************************************/
-SYM_FUNC_START(aesni_gcm_dec)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC dec
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec)
-
-
-/*****************************************************************************
-* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-* // 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the
-* first set of 11 keys in the data structure void *aes_ctx
-*
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-***************************************************************************/
-SYM_FUNC_START(aesni_gcm_enc)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC enc
-
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc)
/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
@@ -1820,8 +1634,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
SYM_FUNC_END(_key_expansion_256b)
/*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- * unsigned int key_len)
+ * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
*/
SYM_FUNC_START(aesni_set_key)
FRAME_BEGIN
@@ -1926,7 +1740,6 @@ SYM_FUNC_START(aesni_set_key)
sub $0x10, UKEYP
cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor AREG, AREG
#ifndef __x86_64__
popl KEYP
#endif
@@ -2826,28 +2639,24 @@ SYM_FUNC_END(aesni_ctr_enc)
.previous
/*
- * _aesni_gf128mul_x_ble: internal ABI
- * Multiply in GF(2^128) for XTS IVs
+ * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
* input:
* IV: current IV
* GF128MUL_MASK == mask with 0x87 and 0x01
* output:
* IV: next IV
* changed:
- * CTR: == temporary value
+ * KEY: == temporary value
*/
-#define _aesni_gf128mul_x_ble() \
- pshufd $0x13, IV, KEY; \
- paddq IV, IV; \
- psrad $31, KEY; \
- pand GF128MUL_MASK, KEY; \
- pxor KEY, IV;
+.macro _aesni_gf128mul_x_ble
+ pshufd $0x13, IV, KEY
+ paddq IV, IV
+ psrad $31, KEY
+ pand GF128MUL_MASK, KEY
+ pxor KEY, IV
+.endm
-/*
- * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_encrypt)
+.macro _aesni_xts_crypt enc
FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
@@ -2866,35 +2675,46 @@ SYM_FUNC_START(aesni_xts_encrypt)
movups (IVP), IV
mov 480(KEYP), KLEN
+.if !\enc
+ add $240, KEYP
-.Lxts_enc_loop4:
+ test $15, LEN
+ jz .Lxts_loop4\@
+ sub $16, LEN
+.endif
+
+.Lxts_loop4\@:
sub $64, LEN
- jl .Lxts_enc_1x
+ jl .Lxts_1x\@
movdqa IV, STATE1
movdqu 0x00(INP), IN
pxor IN, STATE1
movdqu IV, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE2
movdqu 0x10(INP), IN
pxor IN, STATE2
movdqu IV, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE3
movdqu 0x20(INP), IN
pxor IN, STATE3
movdqu IV, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE4
movdqu 0x30(INP), IN
pxor IN, STATE4
movdqu IV, 0x30(OUTP)
+.if \enc
call _aesni_enc4
+.else
+ call _aesni_dec4
+.endif
movdqu 0x00(OUTP), IN
pxor IN, STATE1
@@ -2912,17 +2732,17 @@ SYM_FUNC_START(aesni_xts_encrypt)
pxor IN, STATE4
movdqu STATE4, 0x30(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
add $64, INP
add $64, OUTP
test LEN, LEN
- jnz .Lxts_enc_loop4
+ jnz .Lxts_loop4\@
-.Lxts_enc_ret_iv:
+.Lxts_ret_iv\@:
movups IV, (IVP)
-.Lxts_enc_ret:
+.Lxts_ret\@:
#ifndef __x86_64__
popl KLEN
popl KEYP
@@ -2932,201 +2752,60 @@ SYM_FUNC_START(aesni_xts_encrypt)
FRAME_END
RET
-.Lxts_enc_1x:
+.Lxts_1x\@:
add $64, LEN
- jz .Lxts_enc_ret_iv
+ jz .Lxts_ret_iv\@
+.if \enc
sub $16, LEN
- jl .Lxts_enc_cts4
+ jl .Lxts_cts4\@
+.endif
-.Lxts_enc_loop1:
+.Lxts_loop1\@:
movdqu (INP), STATE
+.if \enc
pxor IV, STATE
call _aesni_enc1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_enc_out
-
+.else
add $16, INP
sub $16, LEN
- jl .Lxts_enc_cts1
-
- movdqu STATE, (OUTP)
- add $16, OUTP
- jmp .Lxts_enc_loop1
-
-.Lxts_enc_out:
- movdqu STATE, (OUTP)
- jmp .Lxts_enc_ret_iv
-
-.Lxts_enc_cts4:
- movdqa STATE4, STATE
- sub $16, OUTP
-
-.Lxts_enc_cts1:
-#ifndef __x86_64__
- lea .Lcts_permute_table, T1
-#else
- lea .Lcts_permute_table(%rip), T1
-#endif
- add LEN, INP /* rewind input pointer */
- add $16, LEN /* # bytes in final block */
- movups (INP), IN1
-
- mov T1, IVP
- add $32, IVP
- add LEN, T1
- sub LEN, IVP
- add OUTP, LEN
-
- movups (T1), %xmm4
- movaps STATE, IN2
- pshufb %xmm4, STATE
- movups STATE, (LEN)
-
- movups (IVP), %xmm0
- pshufb %xmm0, IN1
- pblendvb IN2, IN1
- movaps IN1, STATE
-
+ jl .Lxts_cts1\@
pxor IV, STATE
- call _aesni_enc1
+ call _aesni_dec1
+.endif
pxor IV, STATE
+ _aesni_gf128mul_x_ble
- movups STATE, (OUTP)
- jmp .Lxts_enc_ret
-SYM_FUNC_END(aesni_xts_encrypt)
-
-/*
- * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_decrypt)
- FRAME_BEGIN
-#ifndef __x86_64__
- pushl IVP
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
- movl (FRAME_OFFSET+24)(%esp), OUTP # dst
- movl (FRAME_OFFSET+28)(%esp), INP # src
- movl (FRAME_OFFSET+32)(%esp), LEN # len
- movl (FRAME_OFFSET+36)(%esp), IVP # iv
- movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
-#else
- movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
-#endif
- movups (IVP), IV
-
- mov 480(KEYP), KLEN
- add $240, KEYP
-
- test $15, LEN
- jz .Lxts_dec_loop4
- sub $16, LEN
-
-.Lxts_dec_loop4:
- sub $64, LEN
- jl .Lxts_dec_1x
-
- movdqa IV, STATE1
- movdqu 0x00(INP), IN
- pxor IN, STATE1
- movdqu IV, 0x00(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x10(INP), IN
- pxor IN, STATE2
- movdqu IV, 0x10(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x20(INP), IN
- pxor IN, STATE3
- movdqu IV, 0x20(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x30(INP), IN
- pxor IN, STATE4
- movdqu IV, 0x30(OUTP)
-
- call _aesni_dec4
-
- movdqu 0x00(OUTP), IN
- pxor IN, STATE1
- movdqu STATE1, 0x00(OUTP)
-
- movdqu 0x10(OUTP), IN
- pxor IN, STATE2
- movdqu STATE2, 0x10(OUTP)
-
- movdqu 0x20(OUTP), IN
- pxor IN, STATE3
- movdqu STATE3, 0x20(OUTP)
-
- movdqu 0x30(OUTP), IN
- pxor IN, STATE4
- movdqu STATE4, 0x30(OUTP)
-
- _aesni_gf128mul_x_ble()
-
- add $64, INP
- add $64, OUTP
test LEN, LEN
- jnz .Lxts_dec_loop4
-
-.Lxts_dec_ret_iv:
- movups IV, (IVP)
-
-.Lxts_dec_ret:
-#ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- popl IVP
-#endif
- FRAME_END
- RET
-
-.Lxts_dec_1x:
- add $64, LEN
- jz .Lxts_dec_ret_iv
-
-.Lxts_dec_loop1:
- movdqu (INP), STATE
+ jz .Lxts_out\@
+.if \enc
add $16, INP
sub $16, LEN
- jl .Lxts_dec_cts1
-
- pxor IV, STATE
- call _aesni_dec1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_dec_out
+ jl .Lxts_cts1\@
+.endif
movdqu STATE, (OUTP)
add $16, OUTP
- jmp .Lxts_dec_loop1
+ jmp .Lxts_loop1\@
-.Lxts_dec_out:
+.Lxts_out\@:
movdqu STATE, (OUTP)
- jmp .Lxts_dec_ret_iv
+ jmp .Lxts_ret_iv\@
-.Lxts_dec_cts1:
+.if \enc
+.Lxts_cts4\@:
+ movdqa STATE4, STATE
+ sub $16, OUTP
+.Lxts_cts1\@:
+.else
+.Lxts_cts1\@:
movdqa IV, STATE4
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
pxor IV, STATE
call _aesni_dec1
pxor IV, STATE
-
+.endif
#ifndef __x86_64__
lea .Lcts_permute_table, T1
#else
@@ -3152,10 +2831,32 @@ SYM_FUNC_START(aesni_xts_decrypt)
pblendvb IN2, IN1
movaps IN1, STATE
+.if \enc
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
+.else
pxor STATE4, STATE
call _aesni_dec1
pxor STATE4, STATE
+.endif
movups STATE, (OUTP)
- jmp .Lxts_dec_ret
-SYM_FUNC_END(aesni_xts_decrypt)
+ jmp .Lxts_ret\@
+.endm
+
+/*
+ * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_enc)
+ _aesni_xts_crypt 1
+SYM_FUNC_END(aesni_xts_enc)
+
+/*
+ * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_dec)
+ _aesni_xts_crypt 0
+SYM_FUNC_END(aesni_xts_dec)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index b1d90c2597..ef031655b2 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -40,7 +40,6 @@
#define AESNI_ALIGN 16
#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
-#define RFC4106_HASH_SUBKEY_SIZE 16
#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
@@ -87,8 +86,8 @@ static inline void *aes_align_addr(void *addr)
return PTR_ALIGN(addr, AESNI_ALIGN);
}
-asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- unsigned int key_len);
+asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ unsigned int key_len);
asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
@@ -107,11 +106,11 @@ asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
-asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
-asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
#ifdef CONFIG_X86_64
@@ -233,19 +232,17 @@ static int aes_set_key_common(struct crypto_aes_ctx *ctx,
{
int err;
- if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
- key_len != AES_KEYSIZE_256)
- return -EINVAL;
-
if (!crypto_simd_usable())
- err = aes_expandkey(ctx, in_key, key_len);
- else {
- kernel_fpu_begin();
- err = aesni_set_key(ctx, in_key, key_len);
- kernel_fpu_end();
- }
+ return aes_expandkey(ctx, in_key, key_len);
- return err;
+ err = aes_check_keylen(key_len);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_set_key(ctx, in_key, key_len);
+ kernel_fpu_end();
+ return 0;
}
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
@@ -592,23 +589,12 @@ static int xctr_crypt(struct skcipher_request *req)
return err;
}
-static int
-rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key,
+ u8 hash_subkey[AES_BLOCK_SIZE])
{
- struct crypto_aes_ctx ctx;
- int ret;
-
- ret = aes_expandkey(&ctx, key, key_len);
- if (ret)
- return ret;
-
- /* Clear the data in the hash sub key container to zero.*/
- /* We want to cipher all zeros to create the hash sub key. */
- memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
-
- aes_encrypt(&ctx, hash_subkey, hash_subkey);
+ static const u8 zeroes[AES_BLOCK_SIZE];
- memzero_explicit(&ctx, sizeof(ctx));
+ aes_encrypt(aes_key, hash_subkey, zeroes);
return 0;
}
@@ -626,7 +612,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+ aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
+ ctx->hash_subkey);
}
/* This is the Integrity Check Value (aka the authentication tag) length and can
@@ -877,7 +864,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
}
#endif
-static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
@@ -898,108 +885,149 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen);
}
-static int xts_crypt(struct skcipher_request *req, bool encrypt)
+typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE]);
+
+/* This handles cases where the source and/or destination span pages. */
+static noinline int
+xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct skcipher_walk walk;
+ struct scatterlist *src, *dst;
int err;
- if (req->cryptlen < AES_BLOCK_SIZE)
- return -EINVAL;
-
- err = skcipher_walk_virt(&walk, req, false);
- if (!walk.nbytes)
- return err;
-
- if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
- int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
-
- skcipher_walk_abort(&walk);
-
+ /*
+ * If the message length isn't divisible by the AES block size, then
+ * separate off the last full block and the partial block. This ensures
+ * that they are processed in the same call to the assembly function,
+ * which is required for ciphertext stealing.
+ */
+ if (tail) {
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
- blocks * AES_BLOCK_SIZE, req->iv);
+ req->cryptlen - tail - AES_BLOCK_SIZE,
+ req->iv);
req = &subreq;
+ }
- err = skcipher_walk_virt(&walk, req, false);
- if (!walk.nbytes)
- return err;
- } else {
- tail = 0;
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while (walk.nbytes) {
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx,
+ walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv);
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk,
+ walk.nbytes & (AES_BLOCK_SIZE - 1));
}
- kernel_fpu_begin();
+ if (err || !tail)
+ return err;
- /* calculate first value of T */
- aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv);
+ /* Do ciphertext stealing with the last full block and partial block. */
- while (walk.nbytes > 0) {
- int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes &= ~(AES_BLOCK_SIZE - 1);
-
- if (encrypt)
- aesni_xts_encrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, walk.iv);
- else
- aesni_xts_decrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, walk.iv);
- kernel_fpu_end();
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
- if (walk.nbytes > 0)
- kernel_fpu_begin();
- }
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
- if (unlikely(tail > 0 && !err)) {
- struct scatterlist sg_src[2], sg_dst[2];
- struct scatterlist *src, *dst;
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, req->iv);
+ kernel_fpu_end();
- dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
- if (req->dst != req->src)
- dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+ return skcipher_walk_done(&walk, 0);
+}
- skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
- req->iv);
+/* __always_inline to avoid indirect call in fastpath */
+static __always_inline int
+xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
+ xts_crypt_func crypt_func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ const unsigned int cryptlen = req->cryptlen;
+ struct scatterlist *src = req->src;
+ struct scatterlist *dst = req->dst;
- err = skcipher_walk_virt(&walk, &subreq, false);
- if (err)
- return err;
+ if (unlikely(cryptlen < AES_BLOCK_SIZE))
+ return -EINVAL;
- kernel_fpu_begin();
- if (encrypt)
- aesni_xts_encrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes, walk.iv);
- else
- aesni_xts_decrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes, walk.iv);
- kernel_fpu_end();
+ kernel_fpu_begin();
+ (*encrypt_iv)(&ctx->tweak_ctx, req->iv);
- err = skcipher_walk_done(&walk, 0);
+ /*
+ * In practice, virtually all XTS plaintexts and ciphertexts are either
+ * 512 or 4096 bytes, aligned such that they don't span page boundaries.
+ * To optimize the performance of these cases, and also any other case
+ * where no page boundary is spanned, the below fast-path handles
+ * single-page sources and destinations as efficiently as possible.
+ */
+ if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
+ src->offset + cryptlen <= PAGE_SIZE &&
+ dst->offset + cryptlen <= PAGE_SIZE)) {
+ struct page *src_page = sg_page(src);
+ struct page *dst_page = sg_page(dst);
+ void *src_virt = kmap_local_page(src_page) + src->offset;
+ void *dst_virt = kmap_local_page(dst_page) + dst->offset;
+
+ (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
+ req->iv);
+ kunmap_local(dst_virt);
+ kunmap_local(src_virt);
+ kernel_fpu_end();
+ return 0;
}
- return err;
+ kernel_fpu_end();
+ return xts_crypt_slowpath(req, crypt_func);
+}
+
+static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE])
+{
+ aesni_enc(tweak_key, iv, iv);
+}
+
+static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE])
+{
+ aesni_xts_enc(key, dst, src, len, tweak);
}
-static int xts_encrypt(struct skcipher_request *req)
+static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE])
{
- return xts_crypt(req, true);
+ aesni_xts_dec(key, dst, src, len, tweak);
}
-static int xts_decrypt(struct skcipher_request *req)
+static int xts_encrypt_aesni(struct skcipher_request *req)
{
- return xts_crypt(req, false);
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt);
+}
+
+static int xts_decrypt_aesni(struct skcipher_request *req)
+{
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt);
}
static struct crypto_alg aesni_cipher_alg = {
@@ -1103,9 +1131,9 @@ static struct skcipher_alg aesni_skciphers[] = {
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
- .setkey = xts_aesni_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
+ .setkey = xts_setkey_aesni,
+ .encrypt = xts_encrypt_aesni,
+ .decrypt = xts_decrypt_aesni,
}
};
@@ -1137,7 +1165,149 @@ static struct skcipher_alg aesni_xctr = {
};
static struct simd_skcipher_alg *aesni_simd_xctr;
-#endif /* CONFIG_X86_64 */
+
+asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+
+#define DEFINE_XTS_ALG(suffix, driver_name, priority) \
+ \
+asmlinkage void \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+ \
+static int xts_encrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \
+} \
+ \
+static int xts_decrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \
+} \
+ \
+static struct skcipher_alg aes_xts_alg_##suffix = { \
+ .base = { \
+ .cra_name = "__xts(aes)", \
+ .cra_driver_name = "__" driver_name, \
+ .cra_priority = priority, \
+ .cra_flags = CRYPTO_ALG_INTERNAL, \
+ .cra_blocksize = AES_BLOCK_SIZE, \
+ .cra_ctxsize = XTS_AES_CTX_SIZE, \
+ .cra_module = THIS_MODULE, \
+ }, \
+ .min_keysize = 2 * AES_MIN_KEY_SIZE, \
+ .max_keysize = 2 * AES_MAX_KEY_SIZE, \
+ .ivsize = AES_BLOCK_SIZE, \
+ .walksize = 2 * AES_BLOCK_SIZE, \
+ .setkey = xts_setkey_aesni, \
+ .encrypt = xts_encrypt_##suffix, \
+ .decrypt = xts_decrypt_##suffix, \
+}; \
+ \
+static struct simd_skcipher_alg *aes_xts_simdalg_##suffix
+
+DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500);
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600);
+DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
+DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
+#endif
+
+/*
+ * This is a list of CPU models that are known to suffer from downclocking when
+ * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS
+ * implementation with zmm registers won't be used by default. An
+ * implementation with ymm registers (256-bit vectors) will be used instead.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */
+ {},
+};
+
+static int __init register_xts_algs(void)
+{
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return 0;
+ err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1,
+ &aes_xts_simdalg_aesni_avx);
+ if (err)
+ return err;
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_VAES) ||
+ !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) ||
+ !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ return 0;
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1,
+ &aes_xts_simdalg_vaes_avx2);
+ if (err)
+ return err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX512BW) ||
+ !boot_cpu_has(X86_FEATURE_AVX512VL) ||
+ !boot_cpu_has(X86_FEATURE_BMI2) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ return 0;
+
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1,
+ &aes_xts_simdalg_vaes_avx10_256);
+ if (err)
+ return err;
+
+ if (x86_match_cpu(zmm_exclusion_list))
+ aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
+
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
+ &aes_xts_simdalg_vaes_avx10_512);
+ if (err)
+ return err;
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
+ return 0;
+}
+
+static void unregister_xts_algs(void)
+{
+ if (aes_xts_simdalg_aesni_avx)
+ simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1,
+ &aes_xts_simdalg_aesni_avx);
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+ if (aes_xts_simdalg_vaes_avx2)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1,
+ &aes_xts_simdalg_vaes_avx2);
+ if (aes_xts_simdalg_vaes_avx10_256)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
+ &aes_xts_simdalg_vaes_avx10_256);
+ if (aes_xts_simdalg_vaes_avx10_512)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
+ &aes_xts_simdalg_vaes_avx10_512);
+#endif
+}
+#else /* CONFIG_X86_64 */
+static int __init register_xts_algs(void)
+{
+ return 0;
+}
+
+static void unregister_xts_algs(void)
+{
+}
+#endif /* !CONFIG_X86_64 */
#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
@@ -1146,7 +1316,8 @@ static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+ aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
+ ctx->hash_subkey);
}
static int generic_gcmaes_encrypt(struct aead_request *req)
@@ -1276,13 +1447,21 @@ static int __init aesni_init(void)
goto unregister_aeads;
#endif /* CONFIG_X86_64 */
+ err = register_xts_algs();
+ if (err)
+ goto unregister_xts;
+
return 0;
+unregister_xts:
+ unregister_xts_algs();
#ifdef CONFIG_X86_64
+ if (aesni_simd_xctr)
+ simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
unregister_aeads:
+#endif /* CONFIG_X86_64 */
simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
aesni_simd_aeads);
-#endif /* CONFIG_X86_64 */
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
@@ -1303,6 +1482,7 @@ static void __exit aesni_exit(void)
if (boot_cpu_has(X86_FEATURE_AVX))
simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
#endif /* CONFIG_X86_64 */
+ unregister_xts_algs();
}
late_initcall(aesni_init);
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 537b6dcd7e..d515a55a3b 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -62,20 +62,41 @@
#define SHA256CONSTANTS %rax
-#define MSG %xmm0
+#define MSG %xmm0 /* sha256rnds2 implicit operand */
#define STATE0 %xmm1
#define STATE1 %xmm2
-#define MSGTMP0 %xmm3
-#define MSGTMP1 %xmm4
-#define MSGTMP2 %xmm5
-#define MSGTMP3 %xmm6
-#define MSGTMP4 %xmm7
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define TMP %xmm7
#define SHUF_MASK %xmm8
#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10
+.macro do_4rounds i, m0, m1, m2, m3
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+ movdqa (\i-32)*4(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
+ sha256rnds2 STATE0, STATE1
+.if \i >= 12 && \i < 60
+ movdqa \m0, TMP
+ palignr $4, \m3, TMP
+ paddd TMP, \m1
+ sha256msg2 \m0, \m1
+.endif
+ punpckhqdq MSG, MSG
+ sha256rnds2 STATE1, STATE0
+.if \i >= 4 && \i < 52
+ sha256msg1 \m0, \m3
+.endif
+.endm
+
/*
* Intel SHA Extensions optimized implementation of a SHA-256 update function
*
@@ -86,9 +107,6 @@
* store partial blocks. All message padding and hash value initialization must
* be done outside the update function.
*
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
* void sha256_ni_transform(uint32_t *digest, const void *data,
uint32_t numBlocks);
* digest : pointer to digest
@@ -108,202 +126,29 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
* Need to reorder these appropriately
* DCBA, HGFE -> ABEF, CDGH
*/
- movdqu 0*16(DIGEST_PTR), STATE0
- movdqu 1*16(DIGEST_PTR), STATE1
+ movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */
+ movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */
- pshufd $0xB1, STATE0, STATE0 /* CDAB */
- pshufd $0x1B, STATE1, STATE1 /* EFGH */
- movdqa STATE0, MSGTMP4
- palignr $8, STATE1, STATE0 /* ABEF */
- pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* FEBA */
+ punpckhqdq TMP, STATE1 /* DCHG */
+ pshufd $0x1B, STATE0, STATE0 /* ABEF */
+ pshufd $0xB1, STATE1, STATE1 /* CDGH */
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256(%rip), SHA256CONSTANTS
+ lea K256+32*4(%rip), SHA256CONSTANTS
.Lloop0:
/* Save hash values for addition after rounds */
movdqa STATE0, ABEF_SAVE
movdqa STATE1, CDGH_SAVE
- /* Rounds 0-3 */
- movdqu 0*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP0
- paddd 0*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 4-7 */
- movdqu 1*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP1
- paddd 1*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 8-11 */
- movdqu 2*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP2
- paddd 2*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 12-15 */
- movdqu 3*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP3
- paddd 3*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 16-19 */
- movdqa MSGTMP0, MSG
- paddd 4*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 20-23 */
- movdqa MSGTMP1, MSG
- paddd 5*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 24-27 */
- movdqa MSGTMP2, MSG
- paddd 6*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 28-31 */
- movdqa MSGTMP3, MSG
- paddd 7*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 32-35 */
- movdqa MSGTMP0, MSG
- paddd 8*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 36-39 */
- movdqa MSGTMP1, MSG
- paddd 9*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 40-43 */
- movdqa MSGTMP2, MSG
- paddd 10*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 44-47 */
- movdqa MSGTMP3, MSG
- paddd 11*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 48-51 */
- movdqa MSGTMP0, MSG
- paddd 12*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 52-55 */
- movdqa MSGTMP1, MSG
- paddd 13*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 56-59 */
- movdqa MSGTMP2, MSG
- paddd 14*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 60-63 */
- movdqa MSGTMP3, MSG
- paddd 15*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
+.irp i, 0, 16, 32, 48
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
+.endr
/* Add current hash values with previously saved */
paddd ABEF_SAVE, STATE0
@@ -315,14 +160,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
jne .Lloop0
/* Write hash values back in the correct order */
- pshufd $0x1B, STATE0, STATE0 /* FEBA */
- pshufd $0xB1, STATE1, STATE1 /* DCHG */
- movdqa STATE0, MSGTMP4
- pblendw $0xF0, STATE1, STATE0 /* DCBA */
- palignr $8, MSGTMP4, STATE1 /* HGFE */
-
- movdqu STATE0, 0*16(DIGEST_PTR)
- movdqu STATE1, 1*16(DIGEST_PTR)
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* GHEF */
+ punpckhqdq TMP, STATE1 /* ABCD */
+ pshufd $0xB1, STATE0, STATE0 /* HGFE */
+ pshufd $0x1B, STATE1, STATE1 /* DCBA */
+
+ movdqu STATE1, 0*16(DIGEST_PTR)
+ movdqu STATE0, 1*16(DIGEST_PTR)
.Ldone_hash: