diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
commit | 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch) | |
tree | 848558de17fb3008cdf4d861b01ac7781903ce39 /arch/arm64/crypto | |
parent | Initial commit. (diff) | |
download | linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip |
Adding upstream version 6.1.76.upstream/6.1.76upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/arm64/crypto')
49 files changed, 15352 insertions, 0 deletions
diff --git a/arch/arm64/crypto/.gitignore b/arch/arm64/crypto/.gitignore new file mode 100644 index 000000000..fcf2d731e --- /dev/null +++ b/arch/arm64/crypto/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +sha256-core.S +sha512-core.S +poly1305-core.S diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig new file mode 100644 index 000000000..4b121dc0c --- /dev/null +++ b/arch/arm64/crypto/Kconfig @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Accelerated Cryptographic Algorithms for CPU (arm64)" + +config CRYPTO_GHASH_ARM64_CE + tristate "Hash functions: GHASH (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_GF128MUL + select CRYPTO_LIB_AES + select CRYPTO_AEAD + help + GCM GHASH function (NIST SP800-38D) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_NHPOLY1305_NEON + tristate "Hash functions: NHPoly1305 (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_NHPOLY1305 + help + NHPoly1305 hash function (Adiantum) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_POLY1305_NEON + tristate "Hash functions: Poly1305 (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + help + Poly1305 authenticator algorithm (RFC7539) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_SHA1_ARM64_CE + tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SHA1 + help + SHA-1 secure hash algorithm (FIPS 180) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_SHA256_ARM64 + tristate "Hash functions: SHA-224 and SHA-256" + select CRYPTO_HASH + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: arm64 + +config CRYPTO_SHA2_ARM64_CE + tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SHA256_ARM64 + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_SHA512_ARM64 + tristate "Hash functions: SHA-384 and SHA-512" + select CRYPTO_HASH + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: arm64 + +config CRYPTO_SHA512_ARM64_CE + tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SHA512_ARM64 + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_SHA3_ARM64 + tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SHA3 + help + SHA-3 secure hash algorithms (FIPS 202) + + Architecture: arm64 using: + - ARMv8.2 Crypto Extensions + +config CRYPTO_SM3_NEON + tristate "Hash functions: SM3 (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SM3 + help + SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_SM3_ARM64_CE + tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SM3 + help + SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012) + + Architecture: arm64 using: + - ARMv8.2 Crypto Extensions + +config CRYPTO_POLYVAL_ARM64_CE + tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_POLYVAL + help + POLYVAL hash function for HCTR2 + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_AES_ARM64 + tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS" + select CRYPTO_AES + help + Block ciphers: AES cipher algorithms (FIPS-197) + Length-preserving ciphers: AES with ECB, CBC, CTR, CTS, + XCTR, and XTS modes + AEAD cipher: AES with CBC, ESSIV, and SHA-256 + for fscrypt and dm-crypt + + Architecture: arm64 + +config CRYPTO_AES_ARM64_CE + tristate "Ciphers: AES (ARMv8 Crypto Extensions)" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_LIB_AES + help + Block ciphers: AES cipher algorithms (FIPS-197) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_AES_ARM64_CE_BLK + tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_SKCIPHER + select CRYPTO_AES_ARM64_CE + help + Length-preserving ciphers: AES cipher algorithms (FIPS-197) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E + and IEEE 1619) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + +config CRYPTO_AES_ARM64_NEON_BLK + tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_SKCIPHER + select CRYPTO_LIB_AES + help + Length-preserving ciphers: AES cipher algorithms (FIPS-197) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E + and IEEE 1619) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_CHACHA20_NEON + tristate "Ciphers: ChaCha (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 + stream cipher algorithms + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_AES_ARM64_BS + tristate "Ciphers: AES, modes: ECB/CBC/CTR/XCTR/XTS modes (bit-sliced NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_SKCIPHER + select CRYPTO_AES_ARM64_NEON_BLK + select CRYPTO_LIB_AES + help + Length-preserving ciphers: AES cipher algorithms (FIPS-197) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + - XCTR mode for HCTR2 + - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E + and IEEE 1619) + + Architecture: arm64 using: + - bit-sliced algorithm + - NEON (Advanced SIMD) extensions + +config CRYPTO_SM4_ARM64_CE + tristate "Ciphers: SM4 (ARMv8.2 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_SM4 + help + Block ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016) + + Architecture: arm64 using: + - ARMv8.2 Crypto Extensions + - NEON (Advanced SIMD) extensions + +config CRYPTO_SM4_ARM64_CE_BLK + tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_SKCIPHER + select CRYPTO_SM4 + help + Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CFB (Cipher Feedback) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + - NEON (Advanced SIMD) extensions + +config CRYPTO_SM4_ARM64_NEON_BLK + tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_SKCIPHER + select CRYPTO_SM4 + help + Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016) + with block cipher modes: + - ECB (Electronic Codebook) mode (NIST SP800-38A) + - CBC (Cipher Block Chaining) mode (NIST SP800-38A) + - CFB (Cipher Feedback) mode (NIST SP800-38A) + - CTR (Counter) mode (NIST SP800-38A) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + +config CRYPTO_AES_ARM64_CE_CCM + tristate "AEAD cipher: AES in CCM mode (ARMv8 Crypto Extensions)" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AES_ARM64_CE + select CRYPTO_AEAD + select CRYPTO_LIB_AES + help + AEAD cipher: AES cipher algorithms (FIPS-197) with + CCM (Counter with Cipher Block Chaining-Message Authentication Code) + authenticated encryption mode (NIST SP800-38C) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + - NEON (Advanced SIMD) extensions + +config CRYPTO_CRCT10DIF_ARM64_CE + tristate "CRCT10DIF (PMULL)" + depends on KERNEL_MODE_NEON && CRC_T10DIF + select CRYPTO_HASH + help + CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) + + Architecture: arm64 using + - PMULL (Polynomial Multiply Long) instructions + +endmenu + diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile new file mode 100644 index 000000000..087f1625e --- /dev/null +++ b/arch/arm64/crypto/Makefile @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# linux/arch/arm64/crypto/Makefile +# +# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> +# + +obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o +sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o + +obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o +sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o + +obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o +sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o + +obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o +sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o + +obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o +sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o + +obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o +sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o + +obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o +sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o + +obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o +sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o + +obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o +sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o + +obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o +ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o + +obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o +polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o + +obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o +crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o +aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o +aes-ce-blk-y := aes-glue-ce.o aes-ce.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o +aes-neon-blk-y := aes-glue-neon.o aes-neon.o + +obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o +sha256-arm64-y := sha256-glue.o sha256-core.o + +obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o +sha512-arm64-y := sha512-glue.o sha512-core.o + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o +poly1305-neon-y := poly1305-core.o poly1305-glue.o +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64 + +obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o +nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o + +obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o +aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o +aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o + +CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS + +$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE + $(call if_changed_rule,cc_o_c) + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) void $(@) + +$(obj)/%-core.S: $(src)/%-armv8.pl + $(call cmd,perlasm) + +$(obj)/sha256-core.S: $(src)/sha512-armv8.pl + $(call cmd,perlasm) + +clean-files += poly1305-core.S sha256-core.S sha512-core.S diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S new file mode 100644 index 000000000..b03f7f71f --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions + * + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + /* + * u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, + * u32 macp, u8 const rk[], u32 rounds); + */ +SYM_FUNC_START(ce_aes_ccm_auth_data) + ld1 {v0.16b}, [x0] /* load mac */ + cbz w3, 1f + sub w3, w3, #16 + eor v1.16b, v1.16b, v1.16b +0: ldrb w7, [x1], #1 /* get 1 byte of input */ + subs w2, w2, #1 + add w3, w3, #1 + ins v1.b[0], w7 + ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ + beq 8f /* out of input? */ + cbnz w3, 0b + eor v0.16b, v0.16b, v1.16b +1: ld1 {v3.4s}, [x4] /* load first round key */ + prfm pldl1strm, [x1] + cmp w5, #12 /* which key size? */ + add x6, x4, #16 + sub w7, w5, #2 /* modified # of rounds */ + bmi 2f + bne 5f + mov v5.16b, v3.16b + b 4f +2: mov v4.16b, v3.16b + ld1 {v5.4s}, [x6], #16 /* load 2nd round key */ +3: aese v0.16b, v4.16b + aesmc v0.16b, v0.16b +4: ld1 {v3.4s}, [x6], #16 /* load next round key */ + aese v0.16b, v5.16b + aesmc v0.16b, v0.16b +5: ld1 {v4.4s}, [x6], #16 /* load next round key */ + subs w7, w7, #3 + aese v0.16b, v3.16b + aesmc v0.16b, v0.16b + ld1 {v5.4s}, [x6], #16 /* load next round key */ + bpl 3b + aese v0.16b, v4.16b + subs w2, w2, #16 /* last data? */ + eor v0.16b, v0.16b, v5.16b /* final round */ + bmi 6f + ld1 {v1.16b}, [x1], #16 /* load next input block */ + eor v0.16b, v0.16b, v1.16b /* xor with mac */ + bne 1b +6: st1 {v0.16b}, [x0] /* store mac */ + beq 10f + adds w2, w2, #16 + beq 10f + mov w3, w2 +7: ldrb w7, [x1], #1 + umov w6, v0.b[0] + eor w6, w6, w7 + strb w6, [x0], #1 + subs w2, w2, #1 + beq 10f + ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ + b 7b +8: cbz w3, 91f + mov w7, w3 + add w3, w3, #16 +9: ext v1.16b, v1.16b, v1.16b, #1 + adds w7, w7, #1 + bne 9b +91: eor v0.16b, v0.16b, v1.16b + st1 {v0.16b}, [x0] +10: mov w0, w3 + ret +SYM_FUNC_END(ce_aes_ccm_auth_data) + + /* + * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], + * u32 rounds); + */ +SYM_FUNC_START(ce_aes_ccm_final) + ld1 {v3.4s}, [x2], #16 /* load first round key */ + ld1 {v0.16b}, [x0] /* load mac */ + cmp w3, #12 /* which key size? */ + sub w3, w3, #2 /* modified # of rounds */ + ld1 {v1.16b}, [x1] /* load 1st ctriv */ + bmi 0f + bne 3f + mov v5.16b, v3.16b + b 2f +0: mov v4.16b, v3.16b +1: ld1 {v5.4s}, [x2], #16 /* load next round key */ + aese v0.16b, v4.16b + aesmc v0.16b, v0.16b + aese v1.16b, v4.16b + aesmc v1.16b, v1.16b +2: ld1 {v3.4s}, [x2], #16 /* load next round key */ + aese v0.16b, v5.16b + aesmc v0.16b, v0.16b + aese v1.16b, v5.16b + aesmc v1.16b, v1.16b +3: ld1 {v4.4s}, [x2], #16 /* load next round key */ + subs w3, w3, #3 + aese v0.16b, v3.16b + aesmc v0.16b, v0.16b + aese v1.16b, v3.16b + aesmc v1.16b, v1.16b + bpl 1b + aese v0.16b, v4.16b + aese v1.16b, v4.16b + /* final round key cancels out */ + eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ + st1 {v0.16b}, [x0] /* store result */ + ret +SYM_FUNC_END(ce_aes_ccm_final) + + .macro aes_ccm_do_crypt,enc + cbz x2, 5f + ldr x8, [x6, #8] /* load lower ctr */ + ld1 {v0.16b}, [x5] /* load mac */ +CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ +0: /* outer loop */ + ld1 {v1.8b}, [x6] /* load upper ctr */ + prfm pldl1strm, [x1] + add x8, x8, #1 + rev x9, x8 + cmp w4, #12 /* which key size? */ + sub w7, w4, #2 /* get modified # of rounds */ + ins v1.d[1], x9 /* no carry in lower ctr */ + ld1 {v3.4s}, [x3] /* load first round key */ + add x10, x3, #16 + bmi 1f + bne 4f + mov v5.16b, v3.16b + b 3f +1: mov v4.16b, v3.16b + ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ +2: /* inner loop: 3 rounds, 2x interleaved */ + aese v0.16b, v4.16b + aesmc v0.16b, v0.16b + aese v1.16b, v4.16b + aesmc v1.16b, v1.16b +3: ld1 {v3.4s}, [x10], #16 /* load next round key */ + aese v0.16b, v5.16b + aesmc v0.16b, v0.16b + aese v1.16b, v5.16b + aesmc v1.16b, v1.16b +4: ld1 {v4.4s}, [x10], #16 /* load next round key */ + subs w7, w7, #3 + aese v0.16b, v3.16b + aesmc v0.16b, v0.16b + aese v1.16b, v3.16b + aesmc v1.16b, v1.16b + ld1 {v5.4s}, [x10], #16 /* load next round key */ + bpl 2b + aese v0.16b, v4.16b + aese v1.16b, v4.16b + subs w2, w2, #16 + bmi 6f /* partial block? */ + ld1 {v2.16b}, [x1], #16 /* load next input block */ + .if \enc == 1 + eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ + eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ + .else + eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ + eor v1.16b, v2.16b, v5.16b /* final round enc */ + .endif + eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ + st1 {v1.16b}, [x0], #16 /* write output block */ + bne 0b +CPU_LE( rev x8, x8 ) + st1 {v0.16b}, [x5] /* store mac */ + str x8, [x6, #8] /* store lsb end of ctr (BE) */ +5: ret + +6: eor v0.16b, v0.16b, v5.16b /* final round mac */ + eor v1.16b, v1.16b, v5.16b /* final round enc */ + st1 {v0.16b}, [x5] /* store mac */ + add w2, w2, #16 /* process partial tail block */ +7: ldrb w9, [x1], #1 /* get 1 byte of input */ + umov w6, v1.b[0] /* get top crypted ctr byte */ + umov w7, v0.b[0] /* get top mac byte */ + .if \enc == 1 + eor w7, w7, w9 + eor w9, w9, w6 + .else + eor w9, w9, w6 + eor w7, w7, w9 + .endif + strb w9, [x0], #1 /* store out byte */ + strb w7, [x5], #1 /* store mac byte */ + subs w2, w2, #1 + beq 5b + ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ + ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ + b 7b + .endm + + /* + * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, + * u8 const rk[], u32 rounds, u8 mac[], + * u8 ctr[]); + * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, + * u8 const rk[], u32 rounds, u8 mac[], + * u8 ctr[]); + */ +SYM_FUNC_START(ce_aes_ccm_encrypt) + aes_ccm_do_crypt 1 +SYM_FUNC_END(ce_aes_ccm_encrypt) + +SYM_FUNC_START(ce_aes_ccm_decrypt) + aes_ccm_do_crypt 0 +SYM_FUNC_END(ce_aes_ccm_decrypt) diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c new file mode 100644 index 000000000..c4f14415f --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions + * + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/aes.h> +#include <crypto/scatterwalk.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <linux/module.h> + +#include "aes-ce-setkey.h" + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +asmlinkage u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, + u32 macp, u32 const rk[], u32 rounds); + +asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, + u32 const rk[], u32 rounds, u8 mac[], + u8 ctr[]); + +asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, + u32 const rk[], u32 rounds, u8 mac[], + u8 ctr[]); + +asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], + u32 rounds); + +static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); + + return ce_aes_expandkey(ctx, in_key, key_len); +} + +static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + if ((authsize & 1) || authsize < 4) + return -EINVAL; + return 0; +} + +static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; + u32 l = req->iv[0] + 1; + + /* verify that CCM dimension 'L' is set correctly in the IV */ + if (l < 2 || l > 8) + return -EINVAL; + + /* verify that msglen can in fact be represented in L bytes */ + if (l < 4 && msglen >> (8 * l)) + return -EOVERFLOW; + + /* + * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi + * uses a u32 type to represent msglen so the top 4 bytes are always 0. + */ + n[0] = 0; + n[1] = cpu_to_be32(msglen); + + memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); + + /* + * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) + * - bits 0..2 : max # of bytes required to represent msglen, minus 1 + * (already set by caller) + * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) + * - bit 6 : indicates presence of authenticate-only data + */ + maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; + if (req->assoclen) + maciv[0] |= 0x40; + + memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); + return 0; +} + +static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); + struct __packed { __be16 l; __be32 h; u16 len; } ltag; + struct scatter_walk walk; + u32 len = req->assoclen; + u32 macp = 0; + + /* prepend the AAD with a length tag */ + if (len < 0xff00) { + ltag.l = cpu_to_be16(len); + ltag.len = 2; + } else { + ltag.l = cpu_to_be16(0xfffe); + put_unaligned_be32(len, <ag.h); + ltag.len = 6; + } + + macp = ce_aes_ccm_auth_data(mac, (u8 *)<ag, ltag.len, macp, + ctx->key_enc, num_rounds(ctx)); + scatterwalk_start(&walk, req->src); + + do { + u32 n = scatterwalk_clamp(&walk, len); + u8 *p; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, len); + } + n = min_t(u32, n, SZ_4K); /* yield NEON at least every 4k */ + p = scatterwalk_map(&walk); + + macp = ce_aes_ccm_auth_data(mac, p, n, macp, ctx->key_enc, + num_rounds(ctx)); + + if (len / SZ_4K > (len - n) / SZ_4K) { + kernel_neon_end(); + kernel_neon_begin(); + } + len -= n; + + scatterwalk_unmap(p); + scatterwalk_advance(&walk, n); + scatterwalk_done(&walk, 0, len); + } while (len); +} + +static int ccm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); + struct skcipher_walk walk; + u8 __aligned(8) mac[AES_BLOCK_SIZE]; + u8 buf[AES_BLOCK_SIZE]; + u32 len = req->cryptlen; + int err; + + err = ccm_init_mac(req, mac, len); + if (err) + return err; + + /* preserve the original iv for the final round */ + memcpy(buf, req->iv, AES_BLOCK_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, false); + if (unlikely(err)) + return err; + + kernel_neon_begin(); + + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); + + do { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; + + if (walk.nbytes == walk.total) + tail = 0; + + ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); + + if (walk.nbytes == walk.total) + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + + kernel_neon_end(); + + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + if (unlikely(err)) + return err; + if (unlikely(walk.nbytes)) + kernel_neon_begin(); + } + } while (walk.nbytes); + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int ccm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); + unsigned int authsize = crypto_aead_authsize(aead); + struct skcipher_walk walk; + u8 __aligned(8) mac[AES_BLOCK_SIZE]; + u8 buf[AES_BLOCK_SIZE]; + u32 len = req->cryptlen - authsize; + int err; + + err = ccm_init_mac(req, mac, len); + if (err) + return err; + + /* preserve the original iv for the final round */ + memcpy(buf, req->iv, AES_BLOCK_SIZE); + + err = skcipher_walk_aead_decrypt(&walk, req, false); + if (unlikely(err)) + return err; + + kernel_neon_begin(); + + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); + + do { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; + + if (walk.nbytes == walk.total) + tail = 0; + + ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); + + if (walk.nbytes == walk.total) + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + + kernel_neon_end(); + + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + if (unlikely(err)) + return err; + if (unlikely(walk.nbytes)) + kernel_neon_begin(); + } + } while (walk.nbytes); + + /* compare calculated auth tag with the stored one */ + scatterwalk_map_and_copy(buf, req->src, + req->assoclen + req->cryptlen - authsize, + authsize, 0); + + if (crypto_memneq(mac, buf, authsize)) + return -EBADMSG; + return 0; +} + +static struct aead_alg ccm_aes_alg = { + .base = { + .cra_name = "ccm(aes)", + .cra_driver_name = "ccm-aes-ce", + .cra_priority = 300, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + }, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .maxauthsize = AES_BLOCK_SIZE, + .setkey = ccm_setkey, + .setauthsize = ccm_setauthsize, + .encrypt = ccm_encrypt, + .decrypt = ccm_decrypt, +}; + +static int __init aes_mod_init(void) +{ + if (!cpu_have_named_feature(AES)) + return -ENODEV; + return crypto_register_aead(&ccm_aes_alg); +} + +static void __exit aes_mod_exit(void) +{ + crypto_unregister_aead(&ccm_aes_alg); +} + +module_init(aes_mod_init); +module_exit(aes_mod_exit); + +MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ccm(aes)"); diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S new file mode 100644 index 000000000..e52e13eb8 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-core.S @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .arch armv8-a+crypto + +SYM_FUNC_START(__aes_ce_encrypt) + sub w3, w3, #2 + ld1 {v0.16b}, [x2] + ld1 {v1.4s}, [x0], #16 + cmp w3, #10 + bmi 0f + bne 3f + mov v3.16b, v1.16b + b 2f +0: mov v2.16b, v1.16b + ld1 {v3.4s}, [x0], #16 +1: aese v0.16b, v2.16b + aesmc v0.16b, v0.16b +2: ld1 {v1.4s}, [x0], #16 + aese v0.16b, v3.16b + aesmc v0.16b, v0.16b +3: ld1 {v2.4s}, [x0], #16 + subs w3, w3, #3 + aese v0.16b, v1.16b + aesmc v0.16b, v0.16b + ld1 {v3.4s}, [x0], #16 + bpl 1b + aese v0.16b, v2.16b + eor v0.16b, v0.16b, v3.16b + st1 {v0.16b}, [x1] + ret +SYM_FUNC_END(__aes_ce_encrypt) + +SYM_FUNC_START(__aes_ce_decrypt) + sub w3, w3, #2 + ld1 {v0.16b}, [x2] + ld1 {v1.4s}, [x0], #16 + cmp w3, #10 + bmi 0f + bne 3f + mov v3.16b, v1.16b + b 2f +0: mov v2.16b, v1.16b + ld1 {v3.4s}, [x0], #16 +1: aesd v0.16b, v2.16b + aesimc v0.16b, v0.16b +2: ld1 {v1.4s}, [x0], #16 + aesd v0.16b, v3.16b + aesimc v0.16b, v0.16b +3: ld1 {v2.4s}, [x0], #16 + subs w3, w3, #3 + aesd v0.16b, v1.16b + aesimc v0.16b, v0.16b + ld1 {v3.4s}, [x0], #16 + bpl 1b + aesd v0.16b, v2.16b + eor v0.16b, v0.16b, v3.16b + st1 {v0.16b}, [x1] + ret +SYM_FUNC_END(__aes_ce_decrypt) + +/* + * __aes_ce_sub() - use the aese instruction to perform the AES sbox + * substitution on each byte in 'input' + */ +SYM_FUNC_START(__aes_ce_sub) + dup v1.4s, w0 + movi v0.16b, #0 + aese v0.16b, v1.16b + umov w0, v0.s[0] + ret +SYM_FUNC_END(__aes_ce_sub) + +SYM_FUNC_START(__aes_ce_invert) + ld1 {v0.4s}, [x1] + aesimc v1.16b, v0.16b + st1 {v1.4s}, [x0] + ret +SYM_FUNC_END(__aes_ce_invert) diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c new file mode 100644 index 000000000..56a5f6f0b --- /dev/null +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions + * + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/aes.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +#include "aes-ce-setkey.h" + +MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +struct aes_block { + u8 b[AES_BLOCK_SIZE]; +}; + +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +asmlinkage u32 __aes_ce_sub(u32 l); +asmlinkage void __aes_ce_invert(struct aes_block *out, + const struct aes_block *in); + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + if (!crypto_simd_usable()) { + aes_encrypt(ctx, dst, src); + return; + } + + kernel_neon_begin(); + __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); + kernel_neon_end(); +} + +static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + if (!crypto_simd_usable()) { + aes_decrypt(ctx, dst, src); + return; + } + + kernel_neon_begin(); + __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); + kernel_neon_end(); +} + +int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len) +{ + /* + * The AES key schedule round constants + */ + static u8 const rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, + }; + + u32 kwords = key_len / sizeof(u32); + struct aes_block *key_enc, *key_dec; + int i, j; + + if (key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) + return -EINVAL; + + ctx->key_length = key_len; + for (i = 0; i < kwords; i++) + ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); + + kernel_neon_begin(); + for (i = 0; i < sizeof(rcon); i++) { + u32 *rki = ctx->key_enc + (i * kwords); + u32 *rko = rki + kwords; + + rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; + rko[1] = rko[0] ^ rki[1]; + rko[2] = rko[1] ^ rki[2]; + rko[3] = rko[2] ^ rki[3]; + + if (key_len == AES_KEYSIZE_192) { + if (i >= 7) + break; + rko[4] = rko[3] ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + } else if (key_len == AES_KEYSIZE_256) { + if (i >= 6) + break; + rko[4] = __aes_ce_sub(rko[3]) ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + rko[6] = rko[5] ^ rki[6]; + rko[7] = rko[6] ^ rki[7]; + } + } + + /* + * Generate the decryption keys for the Equivalent Inverse Cipher. + * This involves reversing the order of the round keys, and applying + * the Inverse Mix Columns transformation on all but the first and + * the last one. + */ + key_enc = (struct aes_block *)ctx->key_enc; + key_dec = (struct aes_block *)ctx->key_dec; + j = num_rounds(ctx); + + key_dec[0] = key_enc[j]; + for (i = 1, j--; j > 0; i++, j--) + __aes_ce_invert(key_dec + i, key_enc + j); + key_dec[i] = key_enc[0]; + + kernel_neon_end(); + return 0; +} +EXPORT_SYMBOL(ce_aes_expandkey); + +int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + return ce_aes_expandkey(ctx, in_key, key_len); +} +EXPORT_SYMBOL(ce_aes_setkey); + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-ce", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + .cra_cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = ce_aes_setkey, + .cia_encrypt = aes_cipher_encrypt, + .cia_decrypt = aes_cipher_decrypt + } +}; + +static int __init aes_mod_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_mod_exit(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_cpu_feature_match(AES, aes_mod_init); +module_exit(aes_mod_exit); diff --git a/arch/arm64/crypto/aes-ce-setkey.h b/arch/arm64/crypto/aes-ce-setkey.h new file mode 100644 index 000000000..fd9ecf07d --- /dev/null +++ b/arch/arm64/crypto/aes-ce-setkey.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len); +int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len); diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S new file mode 100644 index 000000000..1dc5bbbfe --- /dev/null +++ b/arch/arm64/crypto/aes-ce.S @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with + * Crypto Extensions + * + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +#define AES_FUNC_START(func) SYM_FUNC_START(ce_ ## func) +#define AES_FUNC_END(func) SYM_FUNC_END(ce_ ## func) + + .arch armv8-a+crypto + + xtsmask .req v16 + cbciv .req v16 + vctr .req v16 + + .macro xts_reload_mask, tmp + .endm + + .macro xts_cts_skip_tw, reg, lbl + .endm + + /* preload all round keys */ + .macro load_round_keys, rounds, rk + cmp \rounds, #12 + blo 2222f /* 128 bits */ + beq 1111f /* 192 bits */ + ld1 {v17.4s-v18.4s}, [\rk], #32 +1111: ld1 {v19.4s-v20.4s}, [\rk], #32 +2222: ld1 {v21.4s-v24.4s}, [\rk], #64 + ld1 {v25.4s-v28.4s}, [\rk], #64 + ld1 {v29.4s-v31.4s}, [\rk] + .endm + + /* prepare for encryption with key in rk[] */ + .macro enc_prepare, rounds, rk, temp + mov \temp, \rk + load_round_keys \rounds, \temp + .endm + + /* prepare for encryption (again) but with new key in rk[] */ + .macro enc_switch_key, rounds, rk, temp + mov \temp, \rk + load_round_keys \rounds, \temp + .endm + + /* prepare for decryption with key in rk[] */ + .macro dec_prepare, rounds, rk, temp + mov \temp, \rk + load_round_keys \rounds, \temp + .endm + + .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4 + aes\de \i0\().16b, \k\().16b + aes\mc \i0\().16b, \i0\().16b + .ifnb \i1 + aes\de \i1\().16b, \k\().16b + aes\mc \i1\().16b, \i1\().16b + .ifnb \i3 + aes\de \i2\().16b, \k\().16b + aes\mc \i2\().16b, \i2\().16b + aes\de \i3\().16b, \k\().16b + aes\mc \i3\().16b, \i3\().16b + .ifnb \i4 + aes\de \i4\().16b, \k\().16b + aes\mc \i4\().16b, \i4\().16b + .endif + .endif + .endif + .endm + + /* up to 5 interleaved encryption rounds with the same round key */ + .macro round_Nx, enc, k, i0, i1, i2, i3, i4 + .ifc \enc, e + do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4 + .else + do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4 + .endif + .endm + + /* up to 5 interleaved final rounds */ + .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4 + aes\de \i0\().16b, \k\().16b + .ifnb \i1 + aes\de \i1\().16b, \k\().16b + .ifnb \i3 + aes\de \i2\().16b, \k\().16b + aes\de \i3\().16b, \k\().16b + .ifnb \i4 + aes\de \i4\().16b, \k\().16b + .endif + .endif + .endif + eor \i0\().16b, \i0\().16b, \k2\().16b + .ifnb \i1 + eor \i1\().16b, \i1\().16b, \k2\().16b + .ifnb \i3 + eor \i2\().16b, \i2\().16b, \k2\().16b + eor \i3\().16b, \i3\().16b, \k2\().16b + .ifnb \i4 + eor \i4\().16b, \i4\().16b, \k2\().16b + .endif + .endif + .endif + .endm + + /* up to 5 interleaved blocks */ + .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4 + cmp \rounds, #12 + blo 2222f /* 128 bits */ + beq 1111f /* 192 bits */ + round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4 + round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4 +1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 + round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4 +2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 + round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4 + .endr + fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4 + .endm + + .macro encrypt_block, in, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \in + .endm + + .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 + .endm + + .macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4 + .endm + + .macro decrypt_block, in, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \in + .endm + + .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 + .endm + + .macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4 + .endm + +#define MAX_STRIDE 5 + +#include "aes-modes.S" diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S new file mode 100644 index 000000000..c9d6955f8 --- /dev/null +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Scalar AES core transform + * + * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + + .text + + rk .req x0 + out .req x1 + in .req x2 + rounds .req x3 + tt .req x2 + + .macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift + .ifc \op\shift, b0 + ubfiz \reg0, \in0, #2, #8 + ubfiz \reg1, \in1e, #2, #8 + .else + ubfx \reg0, \in0, #\shift, #8 + ubfx \reg1, \in1e, #\shift, #8 + .endif + + /* + * AArch64 cannot do byte size indexed loads from a table containing + * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a + * valid instruction. So perform the shift explicitly first for the + * high bytes (the low byte is shifted implicitly by using ubfiz rather + * than ubfx above) + */ + .ifnc \op, b + ldr \reg0, [tt, \reg0, uxtw #2] + ldr \reg1, [tt, \reg1, uxtw #2] + .else + .if \shift > 0 + lsl \reg0, \reg0, #2 + lsl \reg1, \reg1, #2 + .endif + ldrb \reg0, [tt, \reg0, uxtw] + ldrb \reg1, [tt, \reg1, uxtw] + .endif + .endm + + .macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift + ubfx \reg0, \in0, #\shift, #8 + ubfx \reg1, \in1d, #\shift, #8 + ldr\op \reg0, [tt, \reg0, uxtw #\sz] + ldr\op \reg1, [tt, \reg1, uxtw #\sz] + .endm + + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op + ldp \out0, \out1, [rk], #8 + + __pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0 + __pair\enc \sz, \op, w14, w15, \in1, \in2, \in0, 8 + __pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16 + __pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24 + + eor \out0, \out0, w12 + eor \out1, \out1, w13 + eor \out0, \out0, w14, ror #24 + eor \out1, \out1, w15, ror #24 + eor \out0, \out0, w16, ror #16 + eor \out1, \out1, w17, ror #16 + eor \out0, \out0, \t0, ror #8 + eor \out1, \out1, \t1, ror #8 + .endm + + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op + .endm + + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op + .endm + + .macro do_crypt, round, ttab, ltab, bsz + ldp w4, w5, [in] + ldp w6, w7, [in, #8] + ldp w8, w9, [rk], #16 + ldp w10, w11, [rk, #-8] + +CPU_BE( rev w4, w4 ) +CPU_BE( rev w5, w5 ) +CPU_BE( rev w6, w6 ) +CPU_BE( rev w7, w7 ) + + eor w4, w4, w8 + eor w5, w5, w9 + eor w6, w6, w10 + eor w7, w7, w11 + + adr_l tt, \ttab + + tbnz rounds, #1, 1f + +0: \round w8, w9, w10, w11, w4, w5, w6, w7 + \round w4, w5, w6, w7, w8, w9, w10, w11 + +1: subs rounds, rounds, #4 + \round w8, w9, w10, w11, w4, w5, w6, w7 + b.ls 3f +2: \round w4, w5, w6, w7, w8, w9, w10, w11 + b 0b +3: adr_l tt, \ltab + \round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b + +CPU_BE( rev w4, w4 ) +CPU_BE( rev w5, w5 ) +CPU_BE( rev w6, w6 ) +CPU_BE( rev w7, w7 ) + + stp w4, w5, [out] + stp w6, w7, [out, #8] + ret + .endm + +SYM_FUNC_START(__aes_arm64_encrypt) + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 +SYM_FUNC_END(__aes_arm64_encrypt) + + .align 5 +SYM_FUNC_START(__aes_arm64_decrypt) + do_crypt iround, crypto_it_tab, crypto_aes_inv_sbox, 0 +SYM_FUNC_END(__aes_arm64_decrypt) diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c new file mode 100644 index 000000000..8caf6dfef --- /dev/null +++ b/arch/arm64/crypto/aes-cipher-glue.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Scalar AES core transform + * + * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <crypto/aes.h> +#include <linux/crypto.h> +#include <linux/module.h> + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +static void aes_arm64_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + int rounds = 6 + ctx->key_length / 4; + + __aes_arm64_encrypt(ctx->key_enc, out, in, rounds); +} + +static void aes_arm64_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + int rounds = 6 + ctx->key_length / 4; + + __aes_arm64_decrypt(ctx->key_dec, out, in, rounds); +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-arm64", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + + .cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE, + .cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE, + .cra_cipher.cia_setkey = crypto_aes_set_key, + .cra_cipher.cia_encrypt = aes_arm64_encrypt, + .cra_cipher.cia_decrypt = aes_arm64_decrypt +}; + +static int __init aes_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Scalar AES cipher for arm64"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c new file mode 100644 index 000000000..162787c7a --- /dev/null +++ b/arch/arm64/crypto/aes-glue.c @@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES + * + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/hwcap.h> +#include <asm/simd.h> +#include <crypto/aes.h> +#include <crypto/ctr.h> +#include <crypto/sha2.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <crypto/xts.h> + +#include "aes-ce-setkey.h" + +#ifdef USE_V8_CRYPTO_EXTENSIONS +#define MODE "ce" +#define PRIO 300 +#define aes_expandkey ce_aes_expandkey +#define aes_ecb_encrypt ce_aes_ecb_encrypt +#define aes_ecb_decrypt ce_aes_ecb_decrypt +#define aes_cbc_encrypt ce_aes_cbc_encrypt +#define aes_cbc_decrypt ce_aes_cbc_decrypt +#define aes_cbc_cts_encrypt ce_aes_cbc_cts_encrypt +#define aes_cbc_cts_decrypt ce_aes_cbc_cts_decrypt +#define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt +#define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt +#define aes_ctr_encrypt ce_aes_ctr_encrypt +#define aes_xctr_encrypt ce_aes_xctr_encrypt +#define aes_xts_encrypt ce_aes_xts_encrypt +#define aes_xts_decrypt ce_aes_xts_decrypt +#define aes_mac_update ce_aes_mac_update +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions"); +#else +#define MODE "neon" +#define PRIO 200 +#define aes_ecb_encrypt neon_aes_ecb_encrypt +#define aes_ecb_decrypt neon_aes_ecb_decrypt +#define aes_cbc_encrypt neon_aes_cbc_encrypt +#define aes_cbc_decrypt neon_aes_cbc_decrypt +#define aes_cbc_cts_encrypt neon_aes_cbc_cts_encrypt +#define aes_cbc_cts_decrypt neon_aes_cbc_cts_decrypt +#define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt +#define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt +#define aes_ctr_encrypt neon_aes_ctr_encrypt +#define aes_xctr_encrypt neon_aes_xctr_encrypt +#define aes_xts_encrypt neon_aes_xts_encrypt +#define aes_xts_decrypt neon_aes_xts_decrypt +#define aes_mac_update neon_aes_mac_update +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON"); +#endif +#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) +MODULE_ALIAS_CRYPTO("ecb(aes)"); +MODULE_ALIAS_CRYPTO("cbc(aes)"); +MODULE_ALIAS_CRYPTO("ctr(aes)"); +MODULE_ALIAS_CRYPTO("xts(aes)"); +MODULE_ALIAS_CRYPTO("xctr(aes)"); +#endif +MODULE_ALIAS_CRYPTO("cts(cbc(aes))"); +MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)"); +MODULE_ALIAS_CRYPTO("cmac(aes)"); +MODULE_ALIAS_CRYPTO("xcbc(aes)"); +MODULE_ALIAS_CRYPTO("cbcmac(aes)"); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +/* defined in aes-modes.S */ +asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks); +asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks); + +asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[]); + +asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 const iv[]); +asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 const iv[]); + +asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[]); + +asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[], int byte_ctr); + +asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int bytes, u32 const rk2[], u8 iv[], + int first); +asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int bytes, u32 const rk2[], u8 iv[], + int first); + +asmlinkage void aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int blocks, u8 iv[], + u32 const rk2[]); +asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int blocks, u8 iv[], + u32 const rk2[]); + +asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); + +struct crypto_aes_xts_ctx { + struct crypto_aes_ctx key1; + struct crypto_aes_ctx __aligned(8) key2; +}; + +struct crypto_aes_essiv_cbc_ctx { + struct crypto_aes_ctx key1; + struct crypto_aes_ctx __aligned(8) key2; + struct crypto_shash *hash; +}; + +struct mac_tfm_ctx { + struct crypto_aes_ctx key; + u8 __aligned(8) consts[]; +}; + +struct mac_desc_ctx { + unsigned int len; + u8 dg[AES_BLOCK_SIZE]; +}; + +static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + + return aes_expandkey(ctx, in_key, key_len); +} + +static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm, + const u8 *in_key, unsigned int key_len) +{ + struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int ret; + + ret = xts_verify_key(tfm, in_key, key_len); + if (ret) + return ret; + + ret = aes_expandkey(&ctx->key1, in_key, key_len / 2); + if (!ret) + ret = aes_expandkey(&ctx->key2, &in_key[key_len / 2], + key_len / 2); + return ret; +} + +static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm, + const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + u8 digest[SHA256_DIGEST_SIZE]; + int ret; + + ret = aes_expandkey(&ctx->key1, in_key, key_len); + if (ret) + return ret; + + crypto_shash_tfm_digest(ctx->hash, in_key, key_len, digest); + + return aes_expandkey(&ctx->key2, digest, sizeof(digest)); +} + +static int __maybe_unused ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + struct skcipher_walk walk; + unsigned int blocks; + + err = skcipher_walk_virt(&walk, req, false); + + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + kernel_neon_begin(); + aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, rounds, blocks); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); + } + return err; +} + +static int __maybe_unused ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + struct skcipher_walk walk; + unsigned int blocks; + + err = skcipher_walk_virt(&walk, req, false); + + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + kernel_neon_begin(); + aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, rounds, blocks); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); + } + return err; +} + +static int cbc_encrypt_walk(struct skcipher_request *req, + struct skcipher_walk *walk) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err = 0, rounds = 6 + ctx->key_length / 4; + unsigned int blocks; + + while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) { + kernel_neon_begin(); + aes_cbc_encrypt(walk->dst.virt.addr, walk->src.virt.addr, + ctx->key_enc, rounds, blocks, walk->iv); + kernel_neon_end(); + err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE); + } + return err; +} + +static int __maybe_unused cbc_encrypt(struct skcipher_request *req) +{ + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + return cbc_encrypt_walk(req, &walk); +} + +static int cbc_decrypt_walk(struct skcipher_request *req, + struct skcipher_walk *walk) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err = 0, rounds = 6 + ctx->key_length / 4; + unsigned int blocks; + + while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) { + kernel_neon_begin(); + aes_cbc_decrypt(walk->dst.virt.addr, walk->src.virt.addr, + ctx->key_dec, rounds, blocks, walk->iv); + kernel_neon_end(); + err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE); + } + return err; +} + +static int __maybe_unused cbc_decrypt(struct skcipher_request *req) +{ + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + return cbc_decrypt_walk(req, &walk); +} + +static int cts_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false) ?: + cbc_encrypt_walk(&subreq, &walk); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_neon_begin(); + aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, rounds, walk.nbytes, walk.iv); + kernel_neon_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int cts_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false) ?: + cbc_decrypt_walk(&subreq, &walk); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_neon_begin(); + aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, rounds, walk.nbytes, walk.iv); + kernel_neon_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int __maybe_unused essiv_cbc_init_tfm(struct crypto_skcipher *tfm) +{ + struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + + ctx->hash = crypto_alloc_shash("sha256", 0, 0); + + return PTR_ERR_OR_ZERO(ctx->hash); +} + +static void __maybe_unused essiv_cbc_exit_tfm(struct crypto_skcipher *tfm) +{ + struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + + crypto_free_shash(ctx->hash); +} + +static int __maybe_unused essiv_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key1.key_length / 4; + struct skcipher_walk walk; + unsigned int blocks; + + err = skcipher_walk_virt(&walk, req, false); + + blocks = walk.nbytes / AES_BLOCK_SIZE; + if (blocks) { + kernel_neon_begin(); + aes_essiv_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_enc, rounds, blocks, + req->iv, ctx->key2.key_enc); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); + } + return err ?: cbc_encrypt_walk(req, &walk); +} + +static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key1.key_length / 4; + struct skcipher_walk walk; + unsigned int blocks; + + err = skcipher_walk_virt(&walk, req, false); + + blocks = walk.nbytes / AES_BLOCK_SIZE; + if (blocks) { + kernel_neon_begin(); + aes_essiv_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_dec, rounds, blocks, + req->iv, ctx->key2.key_enc); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); + } + return err ?: cbc_decrypt_walk(req, &walk); +} + +static int __maybe_unused xctr_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + struct skcipher_walk walk; + unsigned int byte_ctr = 0; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes > 0) { + const u8 *src = walk.src.virt.addr; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + + /* + * If given less than 16 bytes, we must copy the partial block + * into a temporary buffer of 16 bytes to avoid out of bounds + * reads and writes. Furthermore, this code is somewhat unusual + * in that it expects the end of the data to be at the end of + * the temporary buffer, rather than the start of the data at + * the start of the temporary buffer. + */ + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + else if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + kernel_neon_begin(); + aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv, byte_ctr); + kernel_neon_end(); + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); + byte_ctr += nbytes; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int __maybe_unused ctr_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, rounds = 6 + ctx->key_length / 4; + struct skcipher_walk walk; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes > 0) { + const u8 *src = walk.src.virt.addr; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + + /* + * If given less than 16 bytes, we must copy the partial block + * into a temporary buffer of 16 bytes to avoid out of bounds + * reads and writes. Furthermore, this code is somewhat unusual + * in that it expects the end of the data to be at the end of + * the temporary buffer, rather than the start of the data at + * the start of the temporary buffer. + */ + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + else if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + kernel_neon_begin(); + aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv); + kernel_neon_end(); + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int __maybe_unused xts_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, first, rounds = 6 + ctx->key1.key_length / 4; + int tail = req->cryptlen % AES_BLOCK_SIZE; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct scatterlist *src, *dst; + struct skcipher_walk walk; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + + if (unlikely(tail > 0 && walk.nbytes < walk.total)) { + int xts_blocks = DIV_ROUND_UP(req->cryptlen, + AES_BLOCK_SIZE) - 2; + + skcipher_walk_abort(&walk); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + xts_blocks * AES_BLOCK_SIZE, + req->iv); + req = &subreq; + err = skcipher_walk_virt(&walk, req, false); + } else { + tail = 0; + } + + for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { + int nbytes = walk.nbytes; + + if (walk.nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + kernel_neon_begin(); + aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_enc, rounds, nbytes, + ctx->key2.key_enc, walk.iv, first); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + if (err || likely(!tail)) + return err; + + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_neon_begin(); + aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_enc, rounds, walk.nbytes, + ctx->key2.key_enc, walk.iv, first); + kernel_neon_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int __maybe_unused xts_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int err, first, rounds = 6 + ctx->key1.key_length / 4; + int tail = req->cryptlen % AES_BLOCK_SIZE; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct scatterlist *src, *dst; + struct skcipher_walk walk; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + + if (unlikely(tail > 0 && walk.nbytes < walk.total)) { + int xts_blocks = DIV_ROUND_UP(req->cryptlen, + AES_BLOCK_SIZE) - 2; + + skcipher_walk_abort(&walk); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + xts_blocks * AES_BLOCK_SIZE, + req->iv); + req = &subreq; + err = skcipher_walk_virt(&walk, req, false); + } else { + tail = 0; + } + + for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { + int nbytes = walk.nbytes; + + if (walk.nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + kernel_neon_begin(); + aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_dec, rounds, nbytes, + ctx->key2.key_enc, walk.iv, first); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + if (err || likely(!tail)) + return err; + + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + + kernel_neon_begin(); + aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key1.key_dec, rounds, walk.nbytes, + ctx->key2.key_enc, walk.iv, first); + kernel_neon_end(); + + return skcipher_walk_done(&walk, 0); +} + +static struct skcipher_alg aes_algs[] = { { +#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) + .base = { + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-" MODE, + .cra_priority = PRIO, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, +}, { + .base = { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-" MODE, + .cra_priority = PRIO, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, +}, { + .base = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-" MODE, + .cra_priority = PRIO, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, +}, { + .base = { + .cra_name = "xctr(aes)", + .cra_driver_name = "xctr-aes-" MODE, + .cra_priority = PRIO, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = xctr_encrypt, + .decrypt = xctr_encrypt, +}, { + .base = { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-" MODE, + .cra_priority = PRIO, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, + .setkey = xts_set_key, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, +}, { +#endif + .base = { + .cra_name = "cts(cbc(aes))", + .cra_driver_name = "cts-cbc-aes-" MODE, + .cra_priority = PRIO, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = cts_cbc_encrypt, + .decrypt = cts_cbc_decrypt, +}, { + .base = { + .cra_name = "essiv(cbc(aes),sha256)", + .cra_driver_name = "essiv-cbc-aes-sha256-" MODE, + .cra_priority = PRIO + 1, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_essiv_cbc_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = essiv_cbc_set_key, + .encrypt = essiv_cbc_encrypt, + .decrypt = essiv_cbc_decrypt, + .init = essiv_cbc_init_tfm, + .exit = essiv_cbc_exit_tfm, +} }; + +static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + + return aes_expandkey(&ctx->key, in_key, key_len); +} + +static void cmac_gf128_mul_by_x(be128 *y, const be128 *x) +{ + u64 a = be64_to_cpu(x->a); + u64 b = be64_to_cpu(x->b); + + y->a = cpu_to_be64((a << 1) | (b >> 63)); + y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0)); +} + +static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + be128 *consts = (be128 *)ctx->consts; + int rounds = 6 + key_len / 4; + int err; + + err = cbcmac_setkey(tfm, in_key, key_len); + if (err) + return err; + + /* encrypt the zero vector */ + kernel_neon_begin(); + aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc, + rounds, 1); + kernel_neon_end(); + + cmac_gf128_mul_by_x(consts, consts); + cmac_gf128_mul_by_x(consts + 1, consts); + + return 0; +} + +static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key, + unsigned int key_len) +{ + static u8 const ks[3][AES_BLOCK_SIZE] = { + { [0 ... AES_BLOCK_SIZE - 1] = 0x1 }, + { [0 ... AES_BLOCK_SIZE - 1] = 0x2 }, + { [0 ... AES_BLOCK_SIZE - 1] = 0x3 }, + }; + + struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + int rounds = 6 + key_len / 4; + u8 key[AES_BLOCK_SIZE]; + int err; + + err = cbcmac_setkey(tfm, in_key, key_len); + if (err) + return err; + + kernel_neon_begin(); + aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1); + aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2); + kernel_neon_end(); + + return cbcmac_setkey(tfm, key, sizeof(key)); +} + +static int mac_init(struct shash_desc *desc) +{ + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + + memset(ctx->dg, 0, AES_BLOCK_SIZE); + ctx->len = 0; + + return 0; +} + +static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, + u8 dg[], int enc_before, int enc_after) +{ + int rounds = 6 + ctx->key_length / 4; + + if (crypto_simd_usable()) { + int rem; + + do { + kernel_neon_begin(); + rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, + dg, enc_before, enc_after); + kernel_neon_end(); + in += (blocks - rem) * AES_BLOCK_SIZE; + blocks = rem; + enc_before = 0; + } while (blocks); + } else { + if (enc_before) + aes_encrypt(ctx, dg, dg); + + while (blocks--) { + crypto_xor(dg, in, AES_BLOCK_SIZE); + in += AES_BLOCK_SIZE; + + if (blocks || enc_after) + aes_encrypt(ctx, dg, dg); + } + } +} + +static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) +{ + struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + + while (len > 0) { + unsigned int l; + + if ((ctx->len % AES_BLOCK_SIZE) == 0 && + (ctx->len + len) > AES_BLOCK_SIZE) { + + int blocks = len / AES_BLOCK_SIZE; + + len %= AES_BLOCK_SIZE; + + mac_do_update(&tctx->key, p, blocks, ctx->dg, + (ctx->len != 0), (len != 0)); + + p += blocks * AES_BLOCK_SIZE; + + if (!len) { + ctx->len = AES_BLOCK_SIZE; + break; + } + ctx->len = 0; + } + + l = min(len, AES_BLOCK_SIZE - ctx->len); + + if (l <= AES_BLOCK_SIZE) { + crypto_xor(ctx->dg + ctx->len, p, l); + ctx->len += l; + len -= l; + p += l; + } + } + + return 0; +} + +static int cbcmac_final(struct shash_desc *desc, u8 *out) +{ + struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + + mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0); + + memcpy(out, ctx->dg, AES_BLOCK_SIZE); + + return 0; +} + +static int cmac_final(struct shash_desc *desc, u8 *out) +{ + struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + u8 *consts = tctx->consts; + + if (ctx->len != AES_BLOCK_SIZE) { + ctx->dg[ctx->len] ^= 0x80; + consts += AES_BLOCK_SIZE; + } + + mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1); + + memcpy(out, ctx->dg, AES_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg mac_algs[] = { { + .base.cra_name = "cmac(aes)", + .base.cra_driver_name = "cmac-aes-" MODE, + .base.cra_priority = PRIO, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + + 2 * AES_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = AES_BLOCK_SIZE, + .init = mac_init, + .update = mac_update, + .final = cmac_final, + .setkey = cmac_setkey, + .descsize = sizeof(struct mac_desc_ctx), +}, { + .base.cra_name = "xcbc(aes)", + .base.cra_driver_name = "xcbc-aes-" MODE, + .base.cra_priority = PRIO, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + + 2 * AES_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = AES_BLOCK_SIZE, + .init = mac_init, + .update = mac_update, + .final = cmac_final, + .setkey = xcbc_setkey, + .descsize = sizeof(struct mac_desc_ctx), +}, { + .base.cra_name = "cbcmac(aes)", + .base.cra_driver_name = "cbcmac-aes-" MODE, + .base.cra_priority = PRIO, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), + .base.cra_module = THIS_MODULE, + + .digestsize = AES_BLOCK_SIZE, + .init = mac_init, + .update = mac_update, + .final = cbcmac_final, + .setkey = cbcmac_setkey, + .descsize = sizeof(struct mac_desc_ctx), +} }; + +static void aes_exit(void) +{ + crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs)); + crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); +} + +static int __init aes_init(void) +{ + int err; + + err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); + if (err) + return err; + + err = crypto_register_shashes(mac_algs, ARRAY_SIZE(mac_algs)); + if (err) + goto unregister_ciphers; + + return 0; + +unregister_ciphers: + crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); + return err; +} + +#ifdef USE_V8_CRYPTO_EXTENSIONS +module_cpu_feature_match(AES, aes_init); +#else +module_init(aes_init); +EXPORT_SYMBOL(neon_aes_ecb_encrypt); +EXPORT_SYMBOL(neon_aes_cbc_encrypt); +EXPORT_SYMBOL(neon_aes_ctr_encrypt); +EXPORT_SYMBOL(neon_aes_xts_encrypt); +EXPORT_SYMBOL(neon_aes_xts_decrypt); +#endif +module_exit(aes_exit); diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S new file mode 100644 index 000000000..5abc83427 --- /dev/null +++ b/arch/arm64/crypto/aes-modes.S @@ -0,0 +1,876 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES + * + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +/* included by aes-ce.S and aes-neon.S */ + + .text + .align 4 + +#ifndef MAX_STRIDE +#define MAX_STRIDE 4 +#endif + +#if MAX_STRIDE == 4 +#define ST4(x...) x +#define ST5(x...) +#else +#define ST4(x...) +#define ST5(x...) x +#endif + +SYM_FUNC_START_LOCAL(aes_encrypt_block4x) + encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_encrypt_block4x) + +SYM_FUNC_START_LOCAL(aes_decrypt_block4x) + decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_decrypt_block4x) + +#if MAX_STRIDE == 5 +SYM_FUNC_START_LOCAL(aes_encrypt_block5x) + encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_encrypt_block5x) + +SYM_FUNC_START_LOCAL(aes_decrypt_block5x) + decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_decrypt_block5x) +#endif + + /* + * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + */ + +AES_FUNC_START(aes_ecb_encrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + enc_prepare w3, x2, x5 + +.LecbencloopNx: + subs w4, w4, #MAX_STRIDE + bmi .Lecbenc1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ +ST4( bl aes_encrypt_block4x ) +ST5( ld1 {v4.16b}, [x1], #16 ) +ST5( bl aes_encrypt_block5x ) + st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) + b .LecbencloopNx +.Lecbenc1x: + adds w4, w4, #MAX_STRIDE + beq .Lecbencout +.Lecbencloop: + ld1 {v0.16b}, [x1], #16 /* get next pt block */ + encrypt_block v0, w3, x2, x5, w6 + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lecbencloop +.Lecbencout: + ldp x29, x30, [sp], #16 + ret +AES_FUNC_END(aes_ecb_encrypt) + + +AES_FUNC_START(aes_ecb_decrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + dec_prepare w3, x2, x5 + +.LecbdecloopNx: + subs w4, w4, #MAX_STRIDE + bmi .Lecbdec1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ +ST4( bl aes_decrypt_block4x ) +ST5( ld1 {v4.16b}, [x1], #16 ) +ST5( bl aes_decrypt_block5x ) + st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) + b .LecbdecloopNx +.Lecbdec1x: + adds w4, w4, #MAX_STRIDE + beq .Lecbdecout +.Lecbdecloop: + ld1 {v0.16b}, [x1], #16 /* get next ct block */ + decrypt_block v0, w3, x2, x5, w6 + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lecbdecloop +.Lecbdecout: + ldp x29, x30, [sp], #16 + ret +AES_FUNC_END(aes_ecb_decrypt) + + + /* + * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], + * int rounds, int blocks, u8 iv[], + * u32 const rk2[]); + * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], + * int rounds, int blocks, u8 iv[], + * u32 const rk2[]); + */ + +AES_FUNC_START(aes_essiv_cbc_encrypt) + ld1 {v4.16b}, [x5] /* get iv */ + + mov w8, #14 /* AES-256: 14 rounds */ + enc_prepare w8, x6, x7 + encrypt_block v4, w8, x6, x7, w9 + enc_switch_key w3, x2, x6 + b .Lcbcencloop4x + +AES_FUNC_START(aes_cbc_encrypt) + ld1 {v4.16b}, [x5] /* get iv */ + enc_prepare w3, x2, x6 + +.Lcbcencloop4x: + subs w4, w4, #4 + bmi .Lcbcenc1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ + eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ + encrypt_block v0, w3, x2, x6, w7 + eor v1.16b, v1.16b, v0.16b + encrypt_block v1, w3, x2, x6, w7 + eor v2.16b, v2.16b, v1.16b + encrypt_block v2, w3, x2, x6, w7 + eor v3.16b, v3.16b, v2.16b + encrypt_block v3, w3, x2, x6, w7 + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v3.16b + b .Lcbcencloop4x +.Lcbcenc1x: + adds w4, w4, #4 + beq .Lcbcencout +.Lcbcencloop: + ld1 {v0.16b}, [x1], #16 /* get next pt block */ + eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ + encrypt_block v4, w3, x2, x6, w7 + st1 {v4.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lcbcencloop +.Lcbcencout: + st1 {v4.16b}, [x5] /* return iv */ + ret +AES_FUNC_END(aes_cbc_encrypt) +AES_FUNC_END(aes_essiv_cbc_encrypt) + +AES_FUNC_START(aes_essiv_cbc_decrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {cbciv.16b}, [x5] /* get iv */ + + mov w8, #14 /* AES-256: 14 rounds */ + enc_prepare w8, x6, x7 + encrypt_block cbciv, w8, x6, x7, w9 + b .Lessivcbcdecstart + +AES_FUNC_START(aes_cbc_decrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {cbciv.16b}, [x5] /* get iv */ +.Lessivcbcdecstart: + dec_prepare w3, x2, x6 + +.LcbcdecloopNx: + subs w4, w4, #MAX_STRIDE + bmi .Lcbcdec1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ +#if MAX_STRIDE == 5 + ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ + mov v5.16b, v0.16b + mov v6.16b, v1.16b + mov v7.16b, v2.16b + bl aes_decrypt_block5x + sub x1, x1, #32 + eor v0.16b, v0.16b, cbciv.16b + eor v1.16b, v1.16b, v5.16b + ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v4.16b, v4.16b, v5.16b +#else + mov v4.16b, v0.16b + mov v5.16b, v1.16b + mov v6.16b, v2.16b + bl aes_decrypt_block4x + sub x1, x1, #16 + eor v0.16b, v0.16b, cbciv.16b + eor v1.16b, v1.16b, v4.16b + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ + eor v2.16b, v2.16b, v5.16b + eor v3.16b, v3.16b, v6.16b +#endif + st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) + b .LcbcdecloopNx +.Lcbcdec1x: + adds w4, w4, #MAX_STRIDE + beq .Lcbcdecout +.Lcbcdecloop: + ld1 {v1.16b}, [x1], #16 /* get next ct block */ + mov v0.16b, v1.16b /* ...and copy to v0 */ + decrypt_block v0, w3, x2, x6, w7 + eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ + mov cbciv.16b, v1.16b /* ct is next iv */ + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lcbcdecloop +.Lcbcdecout: + st1 {cbciv.16b}, [x5] /* return iv */ + ldp x29, x30, [sp], #16 + ret +AES_FUNC_END(aes_cbc_decrypt) +AES_FUNC_END(aes_essiv_cbc_decrypt) + + + /* + * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], + * int rounds, int bytes, u8 const iv[]) + * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], + * int rounds, int bytes, u8 const iv[]) + */ + +AES_FUNC_START(aes_cbc_cts_encrypt) + adr_l x8, .Lcts_permute_table + sub x4, x4, #16 + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + ld1 {v3.16b}, [x8] + ld1 {v4.16b}, [x9] + + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ + ld1 {v1.16b}, [x1] + + ld1 {v5.16b}, [x5] /* get iv */ + enc_prepare w3, x2, x6 + + eor v0.16b, v0.16b, v5.16b /* xor with iv */ + tbl v1.16b, {v1.16b}, v4.16b + encrypt_block v0, w3, x2, x6, w7 + + eor v1.16b, v1.16b, v0.16b + tbl v0.16b, {v0.16b}, v3.16b + encrypt_block v1, w3, x2, x6, w7 + + add x4, x0, x4 + st1 {v0.16b}, [x4] /* overlapping stores */ + st1 {v1.16b}, [x0] + ret +AES_FUNC_END(aes_cbc_cts_encrypt) + +AES_FUNC_START(aes_cbc_cts_decrypt) + adr_l x8, .Lcts_permute_table + sub x4, x4, #16 + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + ld1 {v3.16b}, [x8] + ld1 {v4.16b}, [x9] + + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ + ld1 {v1.16b}, [x1] + + ld1 {v5.16b}, [x5] /* get iv */ + dec_prepare w3, x2, x6 + + decrypt_block v0, w3, x2, x6, w7 + tbl v2.16b, {v0.16b}, v3.16b + eor v2.16b, v2.16b, v1.16b + + tbx v0.16b, {v1.16b}, v4.16b + decrypt_block v0, w3, x2, x6, w7 + eor v0.16b, v0.16b, v5.16b /* xor with iv */ + + add x4, x0, x4 + st1 {v2.16b}, [x4] /* overlapping stores */ + st1 {v0.16b}, [x0] + ret +AES_FUNC_END(aes_cbc_cts_decrypt) + + .section ".rodata", "a" + .align 6 +.Lcts_permute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .previous + + /* + * This macro generates the code for CTR and XCTR mode. + */ +.macro ctr_encrypt xctr + // Arguments + OUT .req x0 + IN .req x1 + KEY .req x2 + ROUNDS_W .req w3 + BYTES_W .req w4 + IV .req x5 + BYTE_CTR_W .req w6 // XCTR only + // Intermediate values + CTR_W .req w11 // XCTR only + CTR .req x11 // XCTR only + IV_PART .req x12 + BLOCKS .req x13 + BLOCKS_W .req w13 + + stp x29, x30, [sp, #-16]! + mov x29, sp + + enc_prepare ROUNDS_W, KEY, IV_PART + ld1 {vctr.16b}, [IV] + + /* + * Keep 64 bits of the IV in a register. For CTR mode this lets us + * easily increment the IV. For XCTR mode this lets us efficiently XOR + * the 64-bit counter with the IV. + */ + .if \xctr + umov IV_PART, vctr.d[0] + lsr CTR_W, BYTE_CTR_W, #4 + .else + umov IV_PART, vctr.d[1] + rev IV_PART, IV_PART + .endif + +.LctrloopNx\xctr: + add BLOCKS_W, BYTES_W, #15 + sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 + lsr BLOCKS_W, BLOCKS_W, #4 + mov w8, #MAX_STRIDE + cmp BLOCKS_W, w8 + csel BLOCKS_W, BLOCKS_W, w8, lt + + /* + * Set up the counter values in v0-v{MAX_STRIDE-1}. + * + * If we are encrypting less than MAX_STRIDE blocks, the tail block + * handling code expects the last keystream block to be in + * v{MAX_STRIDE-1}. For example: if encrypting two blocks with + * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks. + */ + .if \xctr + add CTR, CTR, BLOCKS + .else + adds IV_PART, IV_PART, BLOCKS + .endif + mov v0.16b, vctr.16b + mov v1.16b, vctr.16b + mov v2.16b, vctr.16b + mov v3.16b, vctr.16b +ST5( mov v4.16b, vctr.16b ) + .if \xctr + sub x6, CTR, #MAX_STRIDE - 1 + sub x7, CTR, #MAX_STRIDE - 2 + sub x8, CTR, #MAX_STRIDE - 3 + sub x9, CTR, #MAX_STRIDE - 4 +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) + eor x6, x6, IV_PART + eor x7, x7, IV_PART + eor x8, x8, IV_PART + eor x9, x9, IV_PART +ST5( eor x10, x10, IV_PART ) + mov v0.d[0], x6 + mov v1.d[0], x7 + mov v2.d[0], x8 + mov v3.d[0], x9 +ST5( mov v4.d[0], x10 ) + .else + bcs 0f + .subsection 1 + /* + * This subsection handles carries. + * + * Conditional branching here is allowed with respect to time + * invariance since the branches are dependent on the IV instead + * of the plaintext or key. This code is rarely executed in + * practice anyway. + */ + + /* Apply carry to outgoing counter. */ +0: umov x8, vctr.d[0] + rev x8, x8 + add x8, x8, #1 + rev x8, x8 + ins vctr.d[0], x8 + + /* + * Apply carry to counter blocks if needed. + * + * Since the carry flag was set, we know 0 <= IV_PART < + * MAX_STRIDE. Using the value of IV_PART we can determine how + * many counter blocks need to be updated. + */ + cbz IV_PART, 2f + adr x16, 1f + sub x16, x16, IV_PART, lsl #3 + br x16 + bti c + mov v0.d[0], vctr.d[0] + bti c + mov v1.d[0], vctr.d[0] + bti c + mov v2.d[0], vctr.d[0] + bti c + mov v3.d[0], vctr.d[0] +ST5( bti c ) +ST5( mov v4.d[0], vctr.d[0] ) +1: b 2f + .previous + +2: rev x7, IV_PART + ins vctr.d[1], x7 + sub x7, IV_PART, #MAX_STRIDE - 1 + sub x8, IV_PART, #MAX_STRIDE - 2 + sub x9, IV_PART, #MAX_STRIDE - 3 + rev x7, x7 + rev x8, x8 + mov v1.d[1], x7 + rev x9, x9 +ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) + mov v2.d[1], x8 +ST5( rev x10, x10 ) + mov v3.d[1], x9 +ST5( mov v4.d[1], x10 ) + .endif + + /* + * If there are at least MAX_STRIDE blocks left, XOR the data with + * keystream and store. Otherwise jump to tail handling. + */ + tbnz BYTES_W, #31, .Lctrtail\xctr + ld1 {v5.16b-v7.16b}, [IN], #48 +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) + eor v0.16b, v5.16b, v0.16b +ST4( ld1 {v5.16b}, [IN], #16 ) + eor v1.16b, v6.16b, v1.16b +ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) + eor v2.16b, v7.16b, v2.16b + eor v3.16b, v5.16b, v3.16b +ST5( eor v4.16b, v6.16b, v4.16b ) + st1 {v0.16b-v3.16b}, [OUT], #64 +ST5( st1 {v4.16b}, [OUT], #16 ) + cbz BYTES_W, .Lctrout\xctr + b .LctrloopNx\xctr + +.Lctrout\xctr: + .if !\xctr + st1 {vctr.16b}, [IV] /* return next CTR value */ + .endif + ldp x29, x30, [sp], #16 + ret + +.Lctrtail\xctr: + /* + * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext + * + * This code expects the last keystream block to be in v{MAX_STRIDE-1}. + * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and + * v4 should have the next two counter blocks. + * + * This allows us to store the ciphertext by writing to overlapping + * regions of memory. Any invalid ciphertext blocks get overwritten by + * correctly computed blocks. This approach greatly simplifies the + * logic for storing the ciphertext. + */ + mov x16, #16 + ands w7, BYTES_W, #0xf + csel x13, x7, x16, ne + +ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) +ST5( csel x14, x16, xzr, gt ) + cmp BYTES_W, #48 - (MAX_STRIDE << 4) + csel x15, x16, xzr, gt + cmp BYTES_W, #32 - (MAX_STRIDE << 4) + csel x16, x16, xzr, gt + cmp BYTES_W, #16 - (MAX_STRIDE << 4) + + adr_l x9, .Lcts_permute_table + add x9, x9, x13 + ble .Lctrtail1x\xctr + +ST5( ld1 {v5.16b}, [IN], x14 ) + ld1 {v6.16b}, [IN], x15 + ld1 {v7.16b}, [IN], x16 + +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) + + ld1 {v8.16b}, [IN], x13 + ld1 {v9.16b}, [IN] + ld1 {v10.16b}, [x9] + +ST4( eor v6.16b, v6.16b, v0.16b ) +ST4( eor v7.16b, v7.16b, v1.16b ) +ST4( tbl v3.16b, {v3.16b}, v10.16b ) +ST4( eor v8.16b, v8.16b, v2.16b ) +ST4( eor v9.16b, v9.16b, v3.16b ) + +ST5( eor v5.16b, v5.16b, v0.16b ) +ST5( eor v6.16b, v6.16b, v1.16b ) +ST5( tbl v4.16b, {v4.16b}, v10.16b ) +ST5( eor v7.16b, v7.16b, v2.16b ) +ST5( eor v8.16b, v8.16b, v3.16b ) +ST5( eor v9.16b, v9.16b, v4.16b ) + +ST5( st1 {v5.16b}, [OUT], x14 ) + st1 {v6.16b}, [OUT], x15 + st1 {v7.16b}, [OUT], x16 + add x13, x13, OUT + st1 {v9.16b}, [x13] // overlapping stores + st1 {v8.16b}, [OUT] + b .Lctrout\xctr + +.Lctrtail1x\xctr: + /* + * Handle <= 16 bytes of plaintext + * + * This code always reads and writes 16 bytes. To avoid out of bounds + * accesses, XCTR and CTR modes must use a temporary buffer when + * encrypting/decrypting less than 16 bytes. + * + * This code is unusual in that it loads the input and stores the output + * relative to the end of the buffers rather than relative to the start. + * This causes unusual behaviour when encrypting/decrypting less than 16 + * bytes; the end of the data is expected to be at the end of the + * temporary buffer rather than the start of the data being at the start + * of the temporary buffer. + */ + sub x8, x7, #16 + csel x7, x7, x8, eq + add IN, IN, x7 + add OUT, OUT, x7 + ld1 {v5.16b}, [IN] + ld1 {v6.16b}, [OUT] +ST5( mov v3.16b, v4.16b ) + encrypt_block v3, ROUNDS_W, KEY, x8, w7 + ld1 {v10.16b-v11.16b}, [x9] + tbl v3.16b, {v3.16b}, v10.16b + sshr v11.16b, v11.16b, #7 + eor v5.16b, v5.16b, v3.16b + bif v5.16b, v6.16b, v11.16b + st1 {v5.16b}, [OUT] + b .Lctrout\xctr + + // Arguments + .unreq OUT + .unreq IN + .unreq KEY + .unreq ROUNDS_W + .unreq BYTES_W + .unreq IV + .unreq BYTE_CTR_W // XCTR only + // Intermediate values + .unreq CTR_W // XCTR only + .unreq CTR // XCTR only + .unreq IV_PART + .unreq BLOCKS + .unreq BLOCKS_W +.endm + + /* + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 ctr[]) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ + +AES_FUNC_START(aes_ctr_encrypt) + ctr_encrypt 0 +AES_FUNC_END(aes_ctr_encrypt) + + /* + * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 const iv[], int byte_ctr) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ + +AES_FUNC_START(aes_xctr_encrypt) + ctr_encrypt 1 +AES_FUNC_END(aes_xctr_encrypt) + + + /* + * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int bytes, u8 const rk2[], u8 iv[], int first) + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int bytes, u8 const rk2[], u8 iv[], int first) + */ + + .macro next_tweak, out, in, tmp + sshr \tmp\().2d, \in\().2d, #63 + and \tmp\().16b, \tmp\().16b, xtsmask.16b + add \out\().2d, \in\().2d, \in\().2d + ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 + eor \out\().16b, \out\().16b, \tmp\().16b + .endm + + .macro xts_load_mask, tmp + movi xtsmask.2s, #0x1 + movi \tmp\().2s, #0x87 + uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s + .endm + +AES_FUNC_START(aes_xts_encrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v4.16b}, [x6] + xts_load_mask v8 + cbz w7, .Lxtsencnotfirst + + enc_prepare w3, x5, x8 + xts_cts_skip_tw w7, .LxtsencNx + encrypt_block v4, w3, x5, x8, w7 /* first tweak */ + enc_switch_key w3, x2, x8 + b .LxtsencNx + +.Lxtsencnotfirst: + enc_prepare w3, x2, x8 +.LxtsencloopNx: + next_tweak v4, v4, v8 +.LxtsencNx: + subs w4, w4, #64 + bmi .Lxtsenc1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ + next_tweak v5, v4, v8 + eor v0.16b, v0.16b, v4.16b + next_tweak v6, v5, v8 + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + next_tweak v7, v6, v8 + eor v3.16b, v3.16b, v7.16b + bl aes_encrypt_block4x + eor v3.16b, v3.16b, v7.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v7.16b + cbz w4, .Lxtsencret + xts_reload_mask v8 + b .LxtsencloopNx +.Lxtsenc1x: + adds w4, w4, #64 + beq .Lxtsencout + subs w4, w4, #16 + bmi .LxtsencctsNx +.Lxtsencloop: + ld1 {v0.16b}, [x1], #16 +.Lxtsencctsout: + eor v0.16b, v0.16b, v4.16b + encrypt_block v0, w3, x2, x8, w7 + eor v0.16b, v0.16b, v4.16b + cbz w4, .Lxtsencout + subs w4, w4, #16 + next_tweak v4, v4, v8 + bmi .Lxtsenccts + st1 {v0.16b}, [x0], #16 + b .Lxtsencloop +.Lxtsencout: + st1 {v0.16b}, [x0] +.Lxtsencret: + st1 {v4.16b}, [x6] + ldp x29, x30, [sp], #16 + ret + +.LxtsencctsNx: + mov v0.16b, v3.16b + sub x0, x0, #16 +.Lxtsenccts: + adr_l x8, .Lcts_permute_table + + add x1, x1, w4, sxtw /* rewind input pointer */ + add w4, w4, #16 /* # bytes in final block */ + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + add x4, x0, x4 /* output address of final block */ + + ld1 {v1.16b}, [x1] /* load final block */ + ld1 {v2.16b}, [x8] + ld1 {v3.16b}, [x9] + + tbl v2.16b, {v0.16b}, v2.16b + tbx v0.16b, {v1.16b}, v3.16b + st1 {v2.16b}, [x4] /* overlapping stores */ + mov w4, wzr + b .Lxtsencctsout +AES_FUNC_END(aes_xts_encrypt) + +AES_FUNC_START(aes_xts_decrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + /* subtract 16 bytes if we are doing CTS */ + sub w8, w4, #0x10 + tst w4, #0xf + csel w4, w4, w8, eq + + ld1 {v4.16b}, [x6] + xts_load_mask v8 + xts_cts_skip_tw w7, .Lxtsdecskiptw + cbz w7, .Lxtsdecnotfirst + + enc_prepare w3, x5, x8 + encrypt_block v4, w3, x5, x8, w7 /* first tweak */ +.Lxtsdecskiptw: + dec_prepare w3, x2, x8 + b .LxtsdecNx + +.Lxtsdecnotfirst: + dec_prepare w3, x2, x8 +.LxtsdecloopNx: + next_tweak v4, v4, v8 +.LxtsdecNx: + subs w4, w4, #64 + bmi .Lxtsdec1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ + next_tweak v5, v4, v8 + eor v0.16b, v0.16b, v4.16b + next_tweak v6, v5, v8 + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + next_tweak v7, v6, v8 + eor v3.16b, v3.16b, v7.16b + bl aes_decrypt_block4x + eor v3.16b, v3.16b, v7.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v7.16b + cbz w4, .Lxtsdecout + xts_reload_mask v8 + b .LxtsdecloopNx +.Lxtsdec1x: + adds w4, w4, #64 + beq .Lxtsdecout + subs w4, w4, #16 +.Lxtsdecloop: + ld1 {v0.16b}, [x1], #16 + bmi .Lxtsdeccts +.Lxtsdecctsout: + eor v0.16b, v0.16b, v4.16b + decrypt_block v0, w3, x2, x8, w7 + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x0], #16 + cbz w4, .Lxtsdecout + subs w4, w4, #16 + next_tweak v4, v4, v8 + b .Lxtsdecloop +.Lxtsdecout: + st1 {v4.16b}, [x6] + ldp x29, x30, [sp], #16 + ret + +.Lxtsdeccts: + adr_l x8, .Lcts_permute_table + + add x1, x1, w4, sxtw /* rewind input pointer */ + add w4, w4, #16 /* # bytes in final block */ + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + add x4, x0, x4 /* output address of final block */ + + next_tweak v5, v4, v8 + + ld1 {v1.16b}, [x1] /* load final block */ + ld1 {v2.16b}, [x8] + ld1 {v3.16b}, [x9] + + eor v0.16b, v0.16b, v5.16b + decrypt_block v0, w3, x2, x8, w7 + eor v0.16b, v0.16b, v5.16b + + tbl v2.16b, {v0.16b}, v2.16b + tbx v0.16b, {v1.16b}, v3.16b + + st1 {v2.16b}, [x4] /* overlapping stores */ + mov w4, wzr + b .Lxtsdecctsout +AES_FUNC_END(aes_xts_decrypt) + + /* + * aes_mac_update(u8 const in[], u32 const rk[], int rounds, + * int blocks, u8 dg[], int enc_before, int enc_after) + */ +AES_FUNC_START(aes_mac_update) + ld1 {v0.16b}, [x4] /* get dg */ + enc_prepare w2, x1, x7 + cbz w5, .Lmacloop4x + + encrypt_block v0, w2, x1, x7, w8 + +.Lmacloop4x: + subs w3, w3, #4 + bmi .Lmac1x + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ + eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v2.16b + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v3.16b + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v4.16b + cmp w3, wzr + csinv x5, x6, xzr, eq + cbz w5, .Lmacout + encrypt_block v0, w2, x1, x7, w8 + st1 {v0.16b}, [x4] /* return dg */ + cond_yield .Lmacout, x7, x8 + b .Lmacloop4x +.Lmac1x: + add w3, w3, #4 +.Lmacloop: + cbz w3, .Lmacout + ld1 {v1.16b}, [x0], #16 /* get next pt block */ + eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ + + subs w3, w3, #1 + csinv x5, x6, xzr, eq + cbz w5, .Lmacout + +.Lmacenc: + encrypt_block v0, w2, x1, x7, w8 + b .Lmacloop + +.Lmacout: + st1 {v0.16b}, [x4] /* return dg */ + mov w0, w3 + ret +AES_FUNC_END(aes_mac_update) diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S new file mode 100644 index 000000000..9de7fbc79 --- /dev/null +++ b/arch/arm64/crypto/aes-neon.S @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON + * + * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +#define AES_FUNC_START(func) SYM_FUNC_START(neon_ ## func) +#define AES_FUNC_END(func) SYM_FUNC_END(neon_ ## func) + + xtsmask .req v7 + cbciv .req v7 + vctr .req v4 + + .macro xts_reload_mask, tmp + xts_load_mask \tmp + .endm + + /* special case for the neon-bs driver calling into this one for CTS */ + .macro xts_cts_skip_tw, reg, lbl + tbnz \reg, #1, \lbl + .endm + + /* multiply by polynomial 'x' in GF(2^8) */ + .macro mul_by_x, out, in, temp, const + sshr \temp, \in, #7 + shl \out, \in, #1 + and \temp, \temp, \const + eor \out, \out, \temp + .endm + + /* multiply by polynomial 'x^2' in GF(2^8) */ + .macro mul_by_x2, out, in, temp, const + ushr \temp, \in, #6 + shl \out, \in, #2 + pmul \temp, \temp, \const + eor \out, \out, \temp + .endm + + /* preload the entire Sbox */ + .macro prepare, sbox, shiftrows, temp + movi v12.16b, #0x1b + ldr_l q13, \shiftrows, \temp + ldr_l q14, .Lror32by8, \temp + adr_l \temp, \sbox + ld1 {v16.16b-v19.16b}, [\temp], #64 + ld1 {v20.16b-v23.16b}, [\temp], #64 + ld1 {v24.16b-v27.16b}, [\temp], #64 + ld1 {v28.16b-v31.16b}, [\temp] + .endm + + /* do preload for encryption */ + .macro enc_prepare, ignore0, ignore1, temp + prepare crypto_aes_sbox, .LForward_ShiftRows, \temp + .endm + + .macro enc_switch_key, ignore0, ignore1, temp + /* do nothing */ + .endm + + /* do preload for decryption */ + .macro dec_prepare, ignore0, ignore1, temp + prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp + .endm + + /* apply SubBytes transformation using the preloaded Sbox */ + .macro sub_bytes, in + sub v9.16b, \in\().16b, v15.16b + tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b + sub v10.16b, v9.16b, v15.16b + tbx \in\().16b, {v20.16b-v23.16b}, v9.16b + sub v11.16b, v10.16b, v15.16b + tbx \in\().16b, {v24.16b-v27.16b}, v10.16b + tbx \in\().16b, {v28.16b-v31.16b}, v11.16b + .endm + + /* apply MixColumns transformation */ + .macro mix_columns, in, enc + .if \enc == 0 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ + mul_by_x2 v8.16b, \in\().16b, v9.16b, v12.16b + eor \in\().16b, \in\().16b, v8.16b + rev32 v8.8h, v8.8h + eor \in\().16b, \in\().16b, v8.16b + .endif + + mul_by_x v9.16b, \in\().16b, v8.16b, v12.16b + rev32 v8.8h, \in\().8h + eor v8.16b, v8.16b, v9.16b + eor \in\().16b, \in\().16b, v8.16b + tbl \in\().16b, {\in\().16b}, v14.16b + eor \in\().16b, \in\().16b, v8.16b + .endm + + .macro do_block, enc, in, rounds, rk, rkp, i + ld1 {v15.4s}, [\rk] + add \rkp, \rk, #16 + mov \i, \rounds +1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + movi v15.16b, #0x40 + tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ + sub_bytes \in + subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 + beq 2222f + mix_columns \in, \enc + b 1111b +2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + .endm + + .macro encrypt_block, in, rounds, rk, rkp, i + do_block 1, \in, \rounds, \rk, \rkp, \i + .endm + + .macro decrypt_block, in, rounds, rk, rkp, i + do_block 0, \in, \rounds, \rk, \rkp, \i + .endm + + /* + * Interleaved versions: functionally equivalent to the + * ones above, but applied to AES states in parallel. + */ + + .macro sub_bytes_4x, in0, in1, in2, in3 + sub v8.16b, \in0\().16b, v15.16b + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b + sub v9.16b, \in1\().16b, v15.16b + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b + sub v10.16b, \in2\().16b, v15.16b + tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b + sub v11.16b, \in3\().16b, v15.16b + tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b + sub v8.16b, v8.16b, v15.16b + tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b + sub v9.16b, v9.16b, v15.16b + tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b + sub v10.16b, v10.16b, v15.16b + tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b + sub v11.16b, v11.16b, v15.16b + tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b + sub v8.16b, v8.16b, v15.16b + tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b + sub v9.16b, v9.16b, v15.16b + tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b + sub v10.16b, v10.16b, v15.16b + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b + sub v11.16b, v11.16b, v15.16b + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b + tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b + tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b + .endm + + .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const + sshr \tmp0\().16b, \in0\().16b, #7 + shl \out0\().16b, \in0\().16b, #1 + sshr \tmp1\().16b, \in1\().16b, #7 + and \tmp0\().16b, \tmp0\().16b, \const\().16b + shl \out1\().16b, \in1\().16b, #1 + and \tmp1\().16b, \tmp1\().16b, \const\().16b + eor \out0\().16b, \out0\().16b, \tmp0\().16b + eor \out1\().16b, \out1\().16b, \tmp1\().16b + .endm + + .macro mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const + ushr \tmp0\().16b, \in0\().16b, #6 + shl \out0\().16b, \in0\().16b, #2 + ushr \tmp1\().16b, \in1\().16b, #6 + pmul \tmp0\().16b, \tmp0\().16b, \const\().16b + shl \out1\().16b, \in1\().16b, #2 + pmul \tmp1\().16b, \tmp1\().16b, \const\().16b + eor \out0\().16b, \out0\().16b, \tmp0\().16b + eor \out1\().16b, \out1\().16b, \tmp1\().16b + .endm + + .macro mix_columns_2x, in0, in1, enc + .if \enc == 0 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ + mul_by_x2_2x v8, v9, \in0, \in1, v10, v11, v12 + eor \in0\().16b, \in0\().16b, v8.16b + rev32 v8.8h, v8.8h + eor \in1\().16b, \in1\().16b, v9.16b + rev32 v9.8h, v9.8h + eor \in0\().16b, \in0\().16b, v8.16b + eor \in1\().16b, \in1\().16b, v9.16b + .endif + + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12 + rev32 v10.8h, \in0\().8h + rev32 v11.8h, \in1\().8h + eor v10.16b, v10.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor \in0\().16b, \in0\().16b, v10.16b + eor \in1\().16b, \in1\().16b, v11.16b + tbl \in0\().16b, {\in0\().16b}, v14.16b + tbl \in1\().16b, {\in1\().16b}, v14.16b + eor \in0\().16b, \in0\().16b, v10.16b + eor \in1\().16b, \in1\().16b, v11.16b + .endm + + .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i + ld1 {v15.4s}, [\rk] + add \rkp, \rk, #16 + mov \i, \rounds +1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ + movi v15.16b, #0x40 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ + tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ + tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ + sub_bytes_4x \in0, \in1, \in2, \in3 + subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 + beq 2222f + mix_columns_2x \in0, \in1, \enc + mix_columns_2x \in2, \in3, \enc + b 1111b +2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ + .endm + + .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i + do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i + .endm + + .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i + do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i + .endm + +#include "aes-modes.S" + + .section ".rodata", "a" + .align 4 +.LForward_ShiftRows: + .octa 0x0b06010c07020d08030e09040f0a0500 + +.LReverse_ShiftRows: + .octa 0x0306090c0f0205080b0e0104070a0d00 + +.Lror32by8: + .octa 0x0c0f0e0d080b0a090407060500030201 diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S new file mode 100644 index 000000000..b2062eeee --- /dev/null +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -0,0 +1,868 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Bit sliced AES using NEON instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +/* + * The algorithm implemented here is described in detail by the paper + * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and + * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf) + * + * This implementation is based primarily on the OpenSSL implementation + * for 32-bit ARM written by Andy Polyakov <appro@openssl.org> + */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <asm/assembler.h> + + .text + + rounds .req x11 + bskey .req x12 + + .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 + eor \b2, \b2, \b1 + eor \b5, \b5, \b6 + eor \b3, \b3, \b0 + eor \b6, \b6, \b2 + eor \b5, \b5, \b0 + eor \b6, \b6, \b3 + eor \b3, \b3, \b7 + eor \b7, \b7, \b5 + eor \b3, \b3, \b4 + eor \b4, \b4, \b5 + eor \b2, \b2, \b7 + eor \b3, \b3, \b1 + eor \b1, \b1, \b5 + .endm + + .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 + eor \b0, \b0, \b6 + eor \b1, \b1, \b4 + eor \b4, \b4, \b6 + eor \b2, \b2, \b0 + eor \b6, \b6, \b1 + eor \b1, \b1, \b5 + eor \b5, \b5, \b3 + eor \b3, \b3, \b7 + eor \b7, \b7, \b5 + eor \b2, \b2, \b5 + eor \b4, \b4, \b7 + .endm + + .macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5 + eor \b1, \b1, \b7 + eor \b4, \b4, \b7 + eor \b7, \b7, \b5 + eor \b1, \b1, \b3 + eor \b2, \b2, \b5 + eor \b3, \b3, \b7 + eor \b6, \b6, \b1 + eor \b2, \b2, \b0 + eor \b5, \b5, \b3 + eor \b4, \b4, \b6 + eor \b0, \b0, \b6 + eor \b1, \b1, \b4 + .endm + + .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2 + eor \b1, \b1, \b5 + eor \b2, \b2, \b7 + eor \b3, \b3, \b1 + eor \b4, \b4, \b5 + eor \b7, \b7, \b5 + eor \b3, \b3, \b4 + eor \b5, \b5, \b0 + eor \b3, \b3, \b7 + eor \b6, \b6, \b2 + eor \b2, \b2, \b1 + eor \b6, \b6, \b3 + eor \b3, \b3, \b0 + eor \b5, \b5, \b6 + .endm + + .macro mul_gf4, x0, x1, y0, y1, t0, t1 + eor \t0, \y0, \y1 + and \t0, \t0, \x0 + eor \x0, \x0, \x1 + and \t1, \x1, \y0 + and \x0, \x0, \y1 + eor \x1, \t1, \t0 + eor \x0, \x0, \t1 + .endm + + .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1 + eor \t0, \y0, \y1 + eor \t1, \y2, \y3 + and \t0, \t0, \x0 + and \t1, \t1, \x2 + eor \x0, \x0, \x1 + eor \x2, \x2, \x3 + and \x1, \x1, \y0 + and \x3, \x3, \y2 + and \x0, \x0, \y1 + and \x2, \x2, \y3 + eor \x1, \x1, \x0 + eor \x2, \x2, \x3 + eor \x0, \x0, \t0 + eor \x3, \x3, \t1 + .endm + + .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, t0, t1, t2, t3 + eor \t0, \x0, \x2 + eor \t1, \x1, \x3 + mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3 + eor \y0, \y0, \y2 + eor \y1, \y1, \y3 + mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2 + eor \x0, \x0, \t0 + eor \x2, \x2, \t0 + eor \x1, \x1, \t1 + eor \x3, \x3, \t1 + eor \t0, \x4, \x6 + eor \t1, \x5, \x7 + mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2 + eor \y0, \y0, \y2 + eor \y1, \y1, \y3 + mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3 + eor \x4, \x4, \t0 + eor \x6, \x6, \t0 + eor \x5, \x5, \t1 + eor \x7, \x7, \t1 + .endm + + .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + eor \t3, \x4, \x6 + eor \t0, \x5, \x7 + eor \t1, \x1, \x3 + eor \s1, \x7, \x6 + eor \s0, \x0, \x2 + eor \s3, \t3, \t0 + orr \t2, \t0, \t1 + and \s2, \t3, \s0 + orr \t3, \t3, \s0 + eor \s0, \s0, \t1 + and \t0, \t0, \t1 + eor \t1, \x3, \x2 + and \s3, \s3, \s0 + and \s1, \s1, \t1 + eor \t1, \x4, \x5 + eor \s0, \x1, \x0 + eor \t3, \t3, \s1 + eor \t2, \t2, \s1 + and \s1, \t1, \s0 + orr \t1, \t1, \s0 + eor \t3, \t3, \s3 + eor \t0, \t0, \s1 + eor \t2, \t2, \s2 + eor \t1, \t1, \s3 + eor \t0, \t0, \s2 + and \s0, \x7, \x3 + eor \t1, \t1, \s2 + and \s1, \x6, \x2 + and \s2, \x5, \x1 + orr \s3, \x4, \x0 + eor \t3, \t3, \s0 + eor \t1, \t1, \s2 + eor \s0, \t0, \s3 + eor \t2, \t2, \s1 + and \s2, \t3, \t1 + eor \s1, \t2, \s2 + eor \s3, \s0, \s2 + bsl \s1, \t1, \s0 + not \t0, \s0 + bsl \s0, \s1, \s3 + bsl \t0, \s1, \s3 + bsl \s3, \t3, \t2 + eor \t3, \t3, \t2 + and \s2, \s0, \s3 + eor \t1, \t1, \t0 + eor \s2, \s2, \t3 + mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ + \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + .endm + + .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ + \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b + inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \ + \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ + \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ + \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b + out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ + \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b + .endm + + .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ + \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b + inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \ + \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ + \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ + \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b + inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ + \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b + .endm + + .macro enc_next_rk + ldp q16, q17, [bskey], #128 + ldp q18, q19, [bskey, #-96] + ldp q20, q21, [bskey, #-64] + ldp q22, q23, [bskey, #-32] + .endm + + .macro dec_next_rk + ldp q16, q17, [bskey, #-128]! + ldp q18, q19, [bskey, #32] + ldp q20, q21, [bskey, #64] + ldp q22, q23, [bskey, #96] + .endm + + .macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7 + eor \x0\().16b, \x0\().16b, v16.16b + eor \x1\().16b, \x1\().16b, v17.16b + eor \x2\().16b, \x2\().16b, v18.16b + eor \x3\().16b, \x3\().16b, v19.16b + eor \x4\().16b, \x4\().16b, v20.16b + eor \x5\().16b, \x5\().16b, v21.16b + eor \x6\().16b, \x6\().16b, v22.16b + eor \x7\().16b, \x7\().16b, v23.16b + .endm + + .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask + tbl \x0\().16b, {\x0\().16b}, \mask\().16b + tbl \x1\().16b, {\x1\().16b}, \mask\().16b + tbl \x2\().16b, {\x2\().16b}, \mask\().16b + tbl \x3\().16b, {\x3\().16b}, \mask\().16b + tbl \x4\().16b, {\x4\().16b}, \mask\().16b + tbl \x5\().16b, {\x5\().16b}, \mask\().16b + tbl \x6\().16b, {\x6\().16b}, \mask\().16b + tbl \x7\().16b, {\x7\().16b}, \mask\().16b + .endm + + .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, t4, t5, t6, t7, inv + ext \t0\().16b, \x0\().16b, \x0\().16b, #12 + ext \t1\().16b, \x1\().16b, \x1\().16b, #12 + eor \x0\().16b, \x0\().16b, \t0\().16b + ext \t2\().16b, \x2\().16b, \x2\().16b, #12 + eor \x1\().16b, \x1\().16b, \t1\().16b + ext \t3\().16b, \x3\().16b, \x3\().16b, #12 + eor \x2\().16b, \x2\().16b, \t2\().16b + ext \t4\().16b, \x4\().16b, \x4\().16b, #12 + eor \x3\().16b, \x3\().16b, \t3\().16b + ext \t5\().16b, \x5\().16b, \x5\().16b, #12 + eor \x4\().16b, \x4\().16b, \t4\().16b + ext \t6\().16b, \x6\().16b, \x6\().16b, #12 + eor \x5\().16b, \x5\().16b, \t5\().16b + ext \t7\().16b, \x7\().16b, \x7\().16b, #12 + eor \x6\().16b, \x6\().16b, \t6\().16b + eor \t1\().16b, \t1\().16b, \x0\().16b + eor \x7\().16b, \x7\().16b, \t7\().16b + ext \x0\().16b, \x0\().16b, \x0\().16b, #8 + eor \t2\().16b, \t2\().16b, \x1\().16b + eor \t0\().16b, \t0\().16b, \x7\().16b + eor \t1\().16b, \t1\().16b, \x7\().16b + ext \x1\().16b, \x1\().16b, \x1\().16b, #8 + eor \t5\().16b, \t5\().16b, \x4\().16b + eor \x0\().16b, \x0\().16b, \t0\().16b + eor \t6\().16b, \t6\().16b, \x5\().16b + eor \x1\().16b, \x1\().16b, \t1\().16b + ext \t0\().16b, \x4\().16b, \x4\().16b, #8 + eor \t4\().16b, \t4\().16b, \x3\().16b + ext \t1\().16b, \x5\().16b, \x5\().16b, #8 + eor \t7\().16b, \t7\().16b, \x6\().16b + ext \x4\().16b, \x3\().16b, \x3\().16b, #8 + eor \t3\().16b, \t3\().16b, \x2\().16b + ext \x5\().16b, \x7\().16b, \x7\().16b, #8 + eor \t4\().16b, \t4\().16b, \x7\().16b + ext \x3\().16b, \x6\().16b, \x6\().16b, #8 + eor \t3\().16b, \t3\().16b, \x7\().16b + ext \x6\().16b, \x2\().16b, \x2\().16b, #8 + eor \x7\().16b, \t1\().16b, \t5\().16b + .ifb \inv + eor \x2\().16b, \t0\().16b, \t4\().16b + eor \x4\().16b, \x4\().16b, \t3\().16b + eor \x5\().16b, \x5\().16b, \t7\().16b + eor \x3\().16b, \x3\().16b, \t6\().16b + eor \x6\().16b, \x6\().16b, \t2\().16b + .else + eor \t3\().16b, \t3\().16b, \x4\().16b + eor \x5\().16b, \x5\().16b, \t7\().16b + eor \x2\().16b, \x3\().16b, \t6\().16b + eor \x3\().16b, \t0\().16b, \t4\().16b + eor \x4\().16b, \x6\().16b, \t2\().16b + mov \x6\().16b, \t3\().16b + .endif + .endm + + .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, t4, t5, t6, t7 + ext \t0\().16b, \x0\().16b, \x0\().16b, #8 + ext \t6\().16b, \x6\().16b, \x6\().16b, #8 + ext \t7\().16b, \x7\().16b, \x7\().16b, #8 + eor \t0\().16b, \t0\().16b, \x0\().16b + ext \t1\().16b, \x1\().16b, \x1\().16b, #8 + eor \t6\().16b, \t6\().16b, \x6\().16b + ext \t2\().16b, \x2\().16b, \x2\().16b, #8 + eor \t7\().16b, \t7\().16b, \x7\().16b + ext \t3\().16b, \x3\().16b, \x3\().16b, #8 + eor \t1\().16b, \t1\().16b, \x1\().16b + ext \t4\().16b, \x4\().16b, \x4\().16b, #8 + eor \t2\().16b, \t2\().16b, \x2\().16b + ext \t5\().16b, \x5\().16b, \x5\().16b, #8 + eor \t3\().16b, \t3\().16b, \x3\().16b + eor \t4\().16b, \t4\().16b, \x4\().16b + eor \t5\().16b, \t5\().16b, \x5\().16b + eor \x0\().16b, \x0\().16b, \t6\().16b + eor \x1\().16b, \x1\().16b, \t6\().16b + eor \x2\().16b, \x2\().16b, \t0\().16b + eor \x4\().16b, \x4\().16b, \t2\().16b + eor \x3\().16b, \x3\().16b, \t1\().16b + eor \x1\().16b, \x1\().16b, \t7\().16b + eor \x2\().16b, \x2\().16b, \t7\().16b + eor \x4\().16b, \x4\().16b, \t6\().16b + eor \x5\().16b, \x5\().16b, \t3\().16b + eor \x3\().16b, \x3\().16b, \t6\().16b + eor \x6\().16b, \x6\().16b, \t4\().16b + eor \x4\().16b, \x4\().16b, \t7\().16b + eor \x5\().16b, \x5\().16b, \t7\().16b + eor \x7\().16b, \x7\().16b, \t5\().16b + mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ + \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1 + .endm + + .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1 + ushr \t0\().2d, \b0\().2d, #\n + ushr \t1\().2d, \b1\().2d, #\n + eor \t0\().16b, \t0\().16b, \a0\().16b + eor \t1\().16b, \t1\().16b, \a1\().16b + and \t0\().16b, \t0\().16b, \mask\().16b + and \t1\().16b, \t1\().16b, \mask\().16b + eor \a0\().16b, \a0\().16b, \t0\().16b + shl \t0\().2d, \t0\().2d, #\n + eor \a1\().16b, \a1\().16b, \t1\().16b + shl \t1\().2d, \t1\().2d, #\n + eor \b0\().16b, \b0\().16b, \t0\().16b + eor \b1\().16b, \b1\().16b, \t1\().16b + .endm + + .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3 + movi \t0\().16b, #0x55 + movi \t1\().16b, #0x33 + swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3 + swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3 + movi \t0\().16b, #0x0f + swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3 + swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3 + swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3 + swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3 + .endm + + + .align 6 +M0: .octa 0x0004080c0105090d02060a0e03070b0f + +M0SR: .octa 0x0004080c05090d010a0e02060f03070b +SR: .octa 0x0f0e0d0c0a09080b0504070600030201 +SRM0: .octa 0x01060b0c0207080d0304090e00050a0f + +M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 +ISR: .octa 0x0f0e0d0c080b0a090504070602010003 +ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f + + /* + * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) + */ +SYM_FUNC_START(aesbs_convert_key) + ld1 {v7.4s}, [x1], #16 // load round 0 key + ld1 {v17.4s}, [x1], #16 // load round 1 key + + movi v8.16b, #0x01 // bit masks + movi v9.16b, #0x02 + movi v10.16b, #0x04 + movi v11.16b, #0x08 + movi v12.16b, #0x10 + movi v13.16b, #0x20 + movi v14.16b, #0x40 + movi v15.16b, #0x80 + ldr q16, M0 + + sub x2, x2, #1 + str q7, [x0], #16 // save round 0 key + +.Lkey_loop: + tbl v7.16b ,{v17.16b}, v16.16b + ld1 {v17.4s}, [x1], #16 // load next round key + + cmtst v0.16b, v7.16b, v8.16b + cmtst v1.16b, v7.16b, v9.16b + cmtst v2.16b, v7.16b, v10.16b + cmtst v3.16b, v7.16b, v11.16b + cmtst v4.16b, v7.16b, v12.16b + cmtst v5.16b, v7.16b, v13.16b + cmtst v6.16b, v7.16b, v14.16b + cmtst v7.16b, v7.16b, v15.16b + not v0.16b, v0.16b + not v1.16b, v1.16b + not v5.16b, v5.16b + not v6.16b, v6.16b + + subs x2, x2, #1 + stp q0, q1, [x0], #128 + stp q2, q3, [x0, #-96] + stp q4, q5, [x0, #-64] + stp q6, q7, [x0, #-32] + b.ne .Lkey_loop + + movi v7.16b, #0x63 // compose .L63 + eor v17.16b, v17.16b, v7.16b + str q17, [x0] + ret +SYM_FUNC_END(aesbs_convert_key) + + .align 4 +SYM_FUNC_START_LOCAL(aesbs_encrypt8) + ldr q9, [bskey], #16 // round 0 key + ldr q8, M0SR + ldr q24, SR + + eor v10.16b, v0.16b, v9.16b // xor with round0 key + eor v11.16b, v1.16b, v9.16b + tbl v0.16b, {v10.16b}, v8.16b + eor v12.16b, v2.16b, v9.16b + tbl v1.16b, {v11.16b}, v8.16b + eor v13.16b, v3.16b, v9.16b + tbl v2.16b, {v12.16b}, v8.16b + eor v14.16b, v4.16b, v9.16b + tbl v3.16b, {v13.16b}, v8.16b + eor v15.16b, v5.16b, v9.16b + tbl v4.16b, {v14.16b}, v8.16b + eor v10.16b, v6.16b, v9.16b + tbl v5.16b, {v15.16b}, v8.16b + eor v11.16b, v7.16b, v9.16b + tbl v6.16b, {v10.16b}, v8.16b + tbl v7.16b, {v11.16b}, v8.16b + + bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 + + sub rounds, rounds, #1 + b .Lenc_sbox + +.Lenc_loop: + shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 +.Lenc_sbox: + sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ + v13, v14, v15 + subs rounds, rounds, #1 + b.cc .Lenc_done + + enc_next_rk + + mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \ + v13, v14, v15 + + add_round_key v0, v1, v2, v3, v4, v5, v6, v7 + + b.ne .Lenc_loop + ldr q24, SRM0 + b .Lenc_loop + +.Lenc_done: + ldr q12, [bskey] // last round key + + bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11 + + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v12.16b + eor v6.16b, v6.16b, v12.16b + eor v3.16b, v3.16b, v12.16b + eor v7.16b, v7.16b, v12.16b + eor v2.16b, v2.16b, v12.16b + eor v5.16b, v5.16b, v12.16b + ret +SYM_FUNC_END(aesbs_encrypt8) + + .align 4 +SYM_FUNC_START_LOCAL(aesbs_decrypt8) + lsl x9, rounds, #7 + add bskey, bskey, x9 + + ldr q9, [bskey, #-112]! // round 0 key + ldr q8, M0ISR + ldr q24, ISR + + eor v10.16b, v0.16b, v9.16b // xor with round0 key + eor v11.16b, v1.16b, v9.16b + tbl v0.16b, {v10.16b}, v8.16b + eor v12.16b, v2.16b, v9.16b + tbl v1.16b, {v11.16b}, v8.16b + eor v13.16b, v3.16b, v9.16b + tbl v2.16b, {v12.16b}, v8.16b + eor v14.16b, v4.16b, v9.16b + tbl v3.16b, {v13.16b}, v8.16b + eor v15.16b, v5.16b, v9.16b + tbl v4.16b, {v14.16b}, v8.16b + eor v10.16b, v6.16b, v9.16b + tbl v5.16b, {v15.16b}, v8.16b + eor v11.16b, v7.16b, v9.16b + tbl v6.16b, {v10.16b}, v8.16b + tbl v7.16b, {v11.16b}, v8.16b + + bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 + + sub rounds, rounds, #1 + b .Ldec_sbox + +.Ldec_loop: + shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 +.Ldec_sbox: + inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ + v13, v14, v15 + subs rounds, rounds, #1 + b.cc .Ldec_done + + dec_next_rk + + add_round_key v0, v1, v6, v4, v2, v7, v3, v5 + + inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \ + v13, v14, v15 + + b.ne .Ldec_loop + ldr q24, ISRM0 + b .Ldec_loop +.Ldec_done: + ldr q12, [bskey, #-16] // last round key + + bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11 + + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v12.16b + eor v4.16b, v4.16b, v12.16b + eor v2.16b, v2.16b, v12.16b + eor v7.16b, v7.16b, v12.16b + eor v3.16b, v3.16b, v12.16b + eor v5.16b, v5.16b, v12.16b + ret +SYM_FUNC_END(aesbs_decrypt8) + + /* + * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + */ + .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 + frame_push 5 + + mov x19, x0 + mov x20, x1 + mov x21, x2 + mov x22, x3 + mov x23, x4 + +99: mov x5, #1 + lsl x5, x5, x23 + subs w23, w23, #8 + csel x23, x23, xzr, pl + csel x5, x5, xzr, mi + + ld1 {v0.16b}, [x20], #16 + tbnz x5, #1, 0f + ld1 {v1.16b}, [x20], #16 + tbnz x5, #2, 0f + ld1 {v2.16b}, [x20], #16 + tbnz x5, #3, 0f + ld1 {v3.16b}, [x20], #16 + tbnz x5, #4, 0f + ld1 {v4.16b}, [x20], #16 + tbnz x5, #5, 0f + ld1 {v5.16b}, [x20], #16 + tbnz x5, #6, 0f + ld1 {v6.16b}, [x20], #16 + tbnz x5, #7, 0f + ld1 {v7.16b}, [x20], #16 + +0: mov bskey, x21 + mov rounds, x22 + bl \do8 + + st1 {\o0\().16b}, [x19], #16 + tbnz x5, #1, 1f + st1 {\o1\().16b}, [x19], #16 + tbnz x5, #2, 1f + st1 {\o2\().16b}, [x19], #16 + tbnz x5, #3, 1f + st1 {\o3\().16b}, [x19], #16 + tbnz x5, #4, 1f + st1 {\o4\().16b}, [x19], #16 + tbnz x5, #5, 1f + st1 {\o5\().16b}, [x19], #16 + tbnz x5, #6, 1f + st1 {\o6\().16b}, [x19], #16 + tbnz x5, #7, 1f + st1 {\o7\().16b}, [x19], #16 + + cbz x23, 1f + b 99b + +1: frame_pop + ret + .endm + + .align 4 +SYM_TYPED_FUNC_START(aesbs_ecb_encrypt) + __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 +SYM_FUNC_END(aesbs_ecb_encrypt) + + .align 4 +SYM_TYPED_FUNC_START(aesbs_ecb_decrypt) + __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 +SYM_FUNC_END(aesbs_ecb_decrypt) + + /* + * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + */ + .align 4 +SYM_FUNC_START(aesbs_cbc_decrypt) + frame_push 6 + + mov x19, x0 + mov x20, x1 + mov x21, x2 + mov x22, x3 + mov x23, x4 + mov x24, x5 + +99: mov x6, #1 + lsl x6, x6, x23 + subs w23, w23, #8 + csel x23, x23, xzr, pl + csel x6, x6, xzr, mi + + ld1 {v0.16b}, [x20], #16 + mov v25.16b, v0.16b + tbnz x6, #1, 0f + ld1 {v1.16b}, [x20], #16 + mov v26.16b, v1.16b + tbnz x6, #2, 0f + ld1 {v2.16b}, [x20], #16 + mov v27.16b, v2.16b + tbnz x6, #3, 0f + ld1 {v3.16b}, [x20], #16 + mov v28.16b, v3.16b + tbnz x6, #4, 0f + ld1 {v4.16b}, [x20], #16 + mov v29.16b, v4.16b + tbnz x6, #5, 0f + ld1 {v5.16b}, [x20], #16 + mov v30.16b, v5.16b + tbnz x6, #6, 0f + ld1 {v6.16b}, [x20], #16 + mov v31.16b, v6.16b + tbnz x6, #7, 0f + ld1 {v7.16b}, [x20] + +0: mov bskey, x21 + mov rounds, x22 + bl aesbs_decrypt8 + + ld1 {v24.16b}, [x24] // load IV + + eor v1.16b, v1.16b, v25.16b + eor v6.16b, v6.16b, v26.16b + eor v4.16b, v4.16b, v27.16b + eor v2.16b, v2.16b, v28.16b + eor v7.16b, v7.16b, v29.16b + eor v0.16b, v0.16b, v24.16b + eor v3.16b, v3.16b, v30.16b + eor v5.16b, v5.16b, v31.16b + + st1 {v0.16b}, [x19], #16 + mov v24.16b, v25.16b + tbnz x6, #1, 1f + st1 {v1.16b}, [x19], #16 + mov v24.16b, v26.16b + tbnz x6, #2, 1f + st1 {v6.16b}, [x19], #16 + mov v24.16b, v27.16b + tbnz x6, #3, 1f + st1 {v4.16b}, [x19], #16 + mov v24.16b, v28.16b + tbnz x6, #4, 1f + st1 {v2.16b}, [x19], #16 + mov v24.16b, v29.16b + tbnz x6, #5, 1f + st1 {v7.16b}, [x19], #16 + mov v24.16b, v30.16b + tbnz x6, #6, 1f + st1 {v3.16b}, [x19], #16 + mov v24.16b, v31.16b + tbnz x6, #7, 1f + ld1 {v24.16b}, [x20], #16 + st1 {v5.16b}, [x19], #16 +1: st1 {v24.16b}, [x24] // store IV + + cbz x23, 2f + b 99b + +2: frame_pop + ret +SYM_FUNC_END(aesbs_cbc_decrypt) + + .macro next_tweak, out, in, const, tmp + sshr \tmp\().2d, \in\().2d, #63 + and \tmp\().16b, \tmp\().16b, \const\().16b + add \out\().2d, \in\().2d, \in\().2d + ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 + eor \out\().16b, \out\().16b, \tmp\().16b + .endm + + /* + * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + */ +SYM_FUNC_START_LOCAL(__xts_crypt8) + movi v18.2s, #0x1 + movi v19.2s, #0x87 + uzp1 v18.4s, v18.4s, v19.4s + + ld1 {v0.16b-v3.16b}, [x1], #64 + ld1 {v4.16b-v7.16b}, [x1], #64 + + next_tweak v26, v25, v18, v19 + next_tweak v27, v26, v18, v19 + next_tweak v28, v27, v18, v19 + next_tweak v29, v28, v18, v19 + next_tweak v30, v29, v18, v19 + next_tweak v31, v30, v18, v19 + next_tweak v16, v31, v18, v19 + next_tweak v17, v16, v18, v19 + + eor v0.16b, v0.16b, v25.16b + eor v1.16b, v1.16b, v26.16b + eor v2.16b, v2.16b, v27.16b + eor v3.16b, v3.16b, v28.16b + eor v4.16b, v4.16b, v29.16b + eor v5.16b, v5.16b, v30.16b + eor v6.16b, v6.16b, v31.16b + eor v7.16b, v7.16b, v16.16b + + stp q16, q17, [sp, #16] + + mov bskey, x2 + mov rounds, x3 + br x16 +SYM_FUNC_END(__xts_crypt8) + + .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 + stp x29, x30, [sp, #-48]! + mov x29, sp + + ld1 {v25.16b}, [x5] + +0: adr x16, \do8 + bl __xts_crypt8 + + eor v16.16b, \o0\().16b, v25.16b + eor v17.16b, \o1\().16b, v26.16b + eor v18.16b, \o2\().16b, v27.16b + eor v19.16b, \o3\().16b, v28.16b + + ldp q24, q25, [sp, #16] + + eor v20.16b, \o4\().16b, v29.16b + eor v21.16b, \o5\().16b, v30.16b + eor v22.16b, \o6\().16b, v31.16b + eor v23.16b, \o7\().16b, v24.16b + + st1 {v16.16b-v19.16b}, [x0], #64 + st1 {v20.16b-v23.16b}, [x0], #64 + + subs x4, x4, #8 + b.gt 0b + + st1 {v25.16b}, [x5] + ldp x29, x30, [sp], #48 + ret + .endm + +SYM_TYPED_FUNC_START(aesbs_xts_encrypt) + __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 +SYM_FUNC_END(aesbs_xts_encrypt) + +SYM_TYPED_FUNC_START(aesbs_xts_decrypt) + __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 +SYM_FUNC_END(aesbs_xts_decrypt) + + .macro next_ctr, v + mov \v\().d[1], x8 + adds x8, x8, #1 + mov \v\().d[0], x7 + adc x7, x7, xzr + rev64 \v\().16b, \v\().16b + .endm + + /* + * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], + * int rounds, int blocks, u8 iv[]) + */ +SYM_FUNC_START(aesbs_ctr_encrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldp x7, x8, [x5] + ld1 {v0.16b}, [x5] +CPU_LE( rev x7, x7 ) +CPU_LE( rev x8, x8 ) + adds x8, x8, #1 + adc x7, x7, xzr + +0: next_ctr v1 + next_ctr v2 + next_ctr v3 + next_ctr v4 + next_ctr v5 + next_ctr v6 + next_ctr v7 + + mov bskey, x2 + mov rounds, x3 + bl aesbs_encrypt8 + + ld1 { v8.16b-v11.16b}, [x1], #64 + ld1 {v12.16b-v15.16b}, [x1], #64 + + eor v8.16b, v0.16b, v8.16b + eor v9.16b, v1.16b, v9.16b + eor v10.16b, v4.16b, v10.16b + eor v11.16b, v6.16b, v11.16b + eor v12.16b, v3.16b, v12.16b + eor v13.16b, v7.16b, v13.16b + eor v14.16b, v2.16b, v14.16b + eor v15.16b, v5.16b, v15.16b + + st1 { v8.16b-v11.16b}, [x0], #64 + st1 {v12.16b-v15.16b}, [x0], #64 + + next_ctr v0 + subs x4, x4, #8 + b.gt 0b + + st1 {v0.16b}, [x5] + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(aesbs_ctr_encrypt) diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c new file mode 100644 index 000000000..bac4cabef --- /dev/null +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Bit sliced AES using NEON instructions + * + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/aes.h> +#include <crypto/ctr.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <crypto/xts.h> +#include <linux/module.h> + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +MODULE_ALIAS_CRYPTO("ecb(aes)"); +MODULE_ALIAS_CRYPTO("cbc(aes)"); +MODULE_ALIAS_CRYPTO("ctr(aes)"); +MODULE_ALIAS_CRYPTO("xts(aes)"); + +asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds); + +asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks); +asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks); + +asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); + +asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); + +asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); + +/* borrowed from aes-neon-blk.ko */ +asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks); +asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void neon_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[]); +asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[], + u32 const rk1[], int rounds, int bytes, + u32 const rk2[], u8 iv[], int first); +asmlinkage void neon_aes_xts_decrypt(u8 out[], u8 const in[], + u32 const rk1[], int rounds, int bytes, + u32 const rk2[], u8 iv[], int first); + +struct aesbs_ctx { + u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32]; + int rounds; +} __aligned(AES_BLOCK_SIZE); + +struct aesbs_cbc_ctr_ctx { + struct aesbs_ctx key; + u32 enc[AES_MAX_KEYLENGTH_U32]; +}; + +struct aesbs_xts_ctx { + struct aesbs_ctx key; + u32 twkey[AES_MAX_KEYLENGTH_U32]; + struct crypto_aes_ctx cts; +}; + +static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_aes_ctx rk; + int err; + + err = aes_expandkey(&rk, in_key, key_len); + if (err) + return err; + + ctx->rounds = 6 + key_len / 4; + + kernel_neon_begin(); + aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds); + kernel_neon_end(); + + return 0; +} + +static int __ecb_crypt(struct skcipher_request *req, + void (*fn)(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks)) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + + if (walk.nbytes < walk.total) + blocks = round_down(blocks, + walk.stride / AES_BLOCK_SIZE); + + kernel_neon_begin(); + fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk, + ctx->rounds, blocks); + kernel_neon_end(); + err = skcipher_walk_done(&walk, + walk.nbytes - blocks * AES_BLOCK_SIZE); + } + + return err; +} + +static int ecb_encrypt(struct skcipher_request *req) +{ + return __ecb_crypt(req, aesbs_ecb_encrypt); +} + +static int ecb_decrypt(struct skcipher_request *req) +{ + return __ecb_crypt(req, aesbs_ecb_decrypt); +} + +static int aesbs_cbc_ctr_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_aes_ctx rk; + int err; + + err = aes_expandkey(&rk, in_key, key_len); + if (err) + return err; + + ctx->key.rounds = 6 + key_len / 4; + + memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc)); + + kernel_neon_begin(); + aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds); + kernel_neon_end(); + memzero_explicit(&rk, sizeof(rk)); + + return 0; +} + +static int cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + + /* fall back to the non-bitsliced NEON implementation */ + kernel_neon_begin(); + neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->enc, ctx->key.rounds, blocks, + walk.iv); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); + } + return err; +} + +static int cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + + if (walk.nbytes < walk.total) + blocks = round_down(blocks, + walk.stride / AES_BLOCK_SIZE); + + kernel_neon_begin(); + aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key.rk, ctx->key.rounds, blocks, + walk.iv); + kernel_neon_end(); + err = skcipher_walk_done(&walk, + walk.nbytes - blocks * AES_BLOCK_SIZE); + } + + return err; +} + +static int ctr_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes > 0) { + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; + int nbytes = walk.nbytes % (8 * AES_BLOCK_SIZE); + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_neon_begin(); + if (blocks >= 8) { + aesbs_ctr_encrypt(dst, src, ctx->key.rk, ctx->key.rounds, + blocks, walk.iv); + dst += blocks * AES_BLOCK_SIZE; + src += blocks * AES_BLOCK_SIZE; + } + if (nbytes && walk.nbytes == walk.total) { + neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds, + nbytes, walk.iv); + nbytes = 0; + } + kernel_neon_end(); + err = skcipher_walk_done(&walk, nbytes); + } + return err; +} + +static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_aes_ctx rk; + int err; + + err = xts_verify_key(tfm, in_key, key_len); + if (err) + return err; + + key_len /= 2; + err = aes_expandkey(&ctx->cts, in_key, key_len); + if (err) + return err; + + err = aes_expandkey(&rk, in_key + key_len, key_len); + if (err) + return err; + + memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey)); + + return aesbs_setkey(tfm, in_key, key_len); +} + +static int __xts_crypt(struct skcipher_request *req, bool encrypt, + void (*fn)(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[])) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % (8 * AES_BLOCK_SIZE); + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct scatterlist *src, *dst; + struct skcipher_walk walk; + int nbytes, err; + int first = 1; + u8 *out, *in; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + /* ensure that the cts tail is covered by a single step */ + if (unlikely(tail > 0 && tail < AES_BLOCK_SIZE)) { + int xts_blocks = DIV_ROUND_UP(req->cryptlen, + AES_BLOCK_SIZE) - 2; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + xts_blocks * AES_BLOCK_SIZE, + req->iv); + req = &subreq; + } else { + tail = 0; + } + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; + out = walk.dst.virt.addr; + in = walk.src.virt.addr; + nbytes = walk.nbytes; + + kernel_neon_begin(); + if (blocks >= 8) { + if (first == 1) + neon_aes_ecb_encrypt(walk.iv, walk.iv, + ctx->twkey, + ctx->key.rounds, 1); + first = 2; + + fn(out, in, ctx->key.rk, ctx->key.rounds, blocks, + walk.iv); + + out += blocks * AES_BLOCK_SIZE; + in += blocks * AES_BLOCK_SIZE; + nbytes -= blocks * AES_BLOCK_SIZE; + } + if (walk.nbytes == walk.total && nbytes > 0) { + if (encrypt) + neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, + ctx->key.rounds, nbytes, + ctx->twkey, walk.iv, first); + else + neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, + ctx->key.rounds, nbytes, + ctx->twkey, walk.iv, first); + nbytes = first = 0; + } + kernel_neon_end(); + err = skcipher_walk_done(&walk, nbytes); + } + + if (err || likely(!tail)) + return err; + + /* handle ciphertext stealing */ + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + + out = walk.dst.virt.addr; + in = walk.src.virt.addr; + nbytes = walk.nbytes; + + kernel_neon_begin(); + if (encrypt) + neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds, + nbytes, ctx->twkey, walk.iv, first); + else + neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds, + nbytes, ctx->twkey, walk.iv, first); + kernel_neon_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int xts_encrypt(struct skcipher_request *req) +{ + return __xts_crypt(req, true, aesbs_xts_encrypt); +} + +static int xts_decrypt(struct skcipher_request *req) +{ + return __xts_crypt(req, false, aesbs_xts_decrypt); +} + +static struct skcipher_alg aes_algs[] = { { + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .setkey = aesbs_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, +}, { + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_cbc_ctr_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, +}, { + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .chunksize = AES_BLOCK_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_cbc_ctr_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, +}, { + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "xts-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aesbs_xts_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_xts_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, +} }; + +static void aes_exit(void) +{ + crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); +} + +static int __init aes_init(void) +{ + if (!cpu_have_named_feature(ASIMD)) + return -ENODEV; + + return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); +} + +module_init(aes_init); +module_exit(aes_exit); diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S new file mode 100644 index 000000000..b70ac76f2 --- /dev/null +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -0,0 +1,805 @@ +/* + * ChaCha/XChaCha NEON helper functions + * + * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Originally based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + + .text + .align 6 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in w3. + * + * Clobbers: w3, x10, v4, v12 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + + adr_l x10, ROT8 + ld1 {v12.4s}, [x10] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs w3, w3, #2 + b.ne .Ldoubleround + + ret +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + // w3: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + bl chacha_permute + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(chacha_block_xor_neon) + +SYM_FUNC_START(hchacha_block_neon) + // x0: Input state matrix, s + // x1: output (8 32-bit words) + // w2: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v0.4s-v3.4s}, [x0] + + mov w3, w2 + bl chacha_permute + + st1 {v0.4s}, [x1], #16 + st1 {v3.4s}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(hchacha_block_neon) + + a0 .req w12 + a1 .req w13 + a2 .req w14 + a3 .req w15 + a4 .req w16 + a5 .req w17 + a6 .req w19 + a7 .req w20 + a8 .req w21 + a9 .req w22 + a10 .req w23 + a11 .req w24 + a12 .req w25 + a13 .req w26 + a14 .req w27 + a15 .req w28 + + .align 6 +SYM_FUNC_START(chacha_4block_xor_neon) + frame_push 10 + + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + // w3: nrounds + // x4: byte count + + adr_l x10, .Lpermute + and x5, x4, #63 + add x10, x10, x5 + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + // At the same time, a fifth block is encrypted in parallel using + // scalar registers + // + adr_l x9, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x9] + + // x0..15[0-3] = s0..3[0..3] + add x8, x0, #16 + ld4r { v0.4s- v3.4s}, [x0] + ld4r { v4.4s- v7.4s}, [x8], #16 + ld4r { v8.4s-v11.4s}, [x8], #16 + ld4r {v12.4s-v15.4s}, [x8] + + mov a0, v0.s[0] + mov a1, v1.s[0] + mov a2, v2.s[0] + mov a3, v3.s[0] + mov a4, v4.s[0] + mov a5, v5.s[0] + mov a6, v6.s[0] + mov a7, v7.s[0] + mov a8, v8.s[0] + mov a9, v9.s[0] + mov a10, v10.s[0] + mov a11, v11.s[0] + mov a12, v12.s[0] + mov a13, v13.s[0] + mov a14, v14.s[0] + mov a15, v15.s[0] + + // x12 += counter values 1-4 + add v12.4s, v12.4s, v30.4s + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + rev32 v15.8h, v15.8h + ror a15, a15, #16 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 + add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + ror a4, a4, #20 + sri v5.4s, v17.4s, #20 + ror a5, a5, #20 + sri v6.4s, v18.4s, #20 + ror a6, a6, #20 + sri v7.4s, v19.4s, #20 + ror a7, a7, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 + add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + ror a4, a4, #25 + sri v5.4s, v17.4s, #25 + ror a5, a5, #25 + sri v6.4s, v18.4s, #25 + ror a6, a6, #25 + sri v7.4s, v19.4s, #25 + ror a7, a7, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + rev32 v15.8h, v15.8h + ror a15, a15, #16 + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + ror a5, a5, #20 + sri v6.4s, v17.4s, #20 + ror a6, a6, #20 + sri v7.4s, v18.4s, #20 + ror a7, a7, #20 + sri v4.4s, v19.4s, #20 + ror a4, a4, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + ror a5, a5, #25 + sri v6.4s, v17.4s, #25 + ror a6, a6, #25 + sri v7.4s, v18.4s, #25 + ror a7, a7, #25 + sri v4.4s, v19.4s, #25 + ror a4, a4, #25 + + subs w3, w3, #2 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + mov w6, v16.s[0] + mov w7, v17.s[0] + add v1.4s, v1.4s, v17.4s + mov w8, v18.s[0] + mov w9, v19.s[0] + add v2.4s, v2.4s, v18.4s + add a0, a0, w6 + add a1, a1, w7 + add v3.4s, v3.4s, v19.4s + add a2, a2, w8 + add a3, a3, w9 +CPU_BE( rev a0, a0 ) +CPU_BE( rev a1, a1 ) +CPU_BE( rev a2, a2 ) +CPU_BE( rev a3, a3 ) + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + mov w6, v20.s[0] + mov w7, v21.s[0] + add v5.4s, v5.4s, v21.4s + mov w8, v22.s[0] + mov w9, v23.s[0] + add v6.4s, v6.4s, v22.4s + add a4, a4, w6 + add a5, a5, w7 + add v7.4s, v7.4s, v23.4s + add a6, a6, w8 + add a7, a7, w9 +CPU_BE( rev a4, a4 ) +CPU_BE( rev a5, a5 ) +CPU_BE( rev a6, a6 ) +CPU_BE( rev a7, a7 ) + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + mov w6, v24.s[0] + mov w7, v25.s[0] + add v9.4s, v9.4s, v25.4s + mov w8, v26.s[0] + mov w9, v27.s[0] + add v10.4s, v10.4s, v26.4s + add a8, a8, w6 + add a9, a9, w7 + add v11.4s, v11.4s, v27.4s + add a10, a10, w8 + add a11, a11, w9 +CPU_BE( rev a8, a8 ) +CPU_BE( rev a9, a9 ) +CPU_BE( rev a10, a10 ) +CPU_BE( rev a11, a11 ) + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + mov w6, v28.s[0] + mov w7, v29.s[0] + add v13.4s, v13.4s, v29.4s + mov w8, v30.s[0] + mov w9, v31.s[0] + add v14.4s, v14.4s, v30.4s + add a12, a12, w6 + add a13, a13, w7 + add v15.4s, v15.4s, v31.4s + add a14, a14, w8 + add a15, a15, w9 +CPU_BE( rev a12, a12 ) +CPU_BE( rev a13, a13 ) +CPU_BE( rev a14, a14 ) +CPU_BE( rev a15, a15 ) + + // interleave 32-bit words in state n, n+1 + ldp w6, w7, [x2], #64 + zip1 v16.4s, v0.4s, v1.4s + ldp w8, w9, [x2, #-56] + eor a0, a0, w6 + zip2 v17.4s, v0.4s, v1.4s + eor a1, a1, w7 + zip1 v18.4s, v2.4s, v3.4s + eor a2, a2, w8 + zip2 v19.4s, v2.4s, v3.4s + eor a3, a3, w9 + ldp w6, w7, [x2, #-48] + zip1 v20.4s, v4.4s, v5.4s + ldp w8, w9, [x2, #-40] + eor a4, a4, w6 + zip2 v21.4s, v4.4s, v5.4s + eor a5, a5, w7 + zip1 v22.4s, v6.4s, v7.4s + eor a6, a6, w8 + zip2 v23.4s, v6.4s, v7.4s + eor a7, a7, w9 + ldp w6, w7, [x2, #-32] + zip1 v24.4s, v8.4s, v9.4s + ldp w8, w9, [x2, #-24] + eor a8, a8, w6 + zip2 v25.4s, v8.4s, v9.4s + eor a9, a9, w7 + zip1 v26.4s, v10.4s, v11.4s + eor a10, a10, w8 + zip2 v27.4s, v10.4s, v11.4s + eor a11, a11, w9 + ldp w6, w7, [x2, #-16] + zip1 v28.4s, v12.4s, v13.4s + ldp w8, w9, [x2, #-8] + eor a12, a12, w6 + zip2 v29.4s, v12.4s, v13.4s + eor a13, a13, w7 + zip1 v30.4s, v14.4s, v15.4s + eor a14, a14, w8 + zip2 v31.4s, v14.4s, v15.4s + eor a15, a15, w9 + + add x3, x2, x4 + sub x3, x3, #128 // start of last block + + subs x5, x4, #128 + csel x2, x2, x3, ge + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + stp a0, a1, [x1], #64 + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + stp a2, a3, [x1, #-56] + + subs x6, x4, #192 + ld1 {v16.16b-v19.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v1.2d, v20.2d, v22.2d + zip2 v5.2d, v20.2d, v22.2d + stp a4, a5, [x1, #-48] + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + stp a6, a7, [x1, #-40] + + subs x7, x4, #256 + ld1 {v20.16b-v23.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + stp a8, a9, [x1, #-32] + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + stp a10, a11, [x1, #-24] + + subs x8, x4, #320 + ld1 {v24.16b-v27.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + stp a12, a13, [x1, #-16] + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + stp a14, a15, [x1, #-8] + + tbnz x5, #63, .Lt128 + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + + tbnz x6, #63, .Lt192 + + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + + st1 {v16.16b-v19.16b}, [x1], #64 + tbnz x7, #63, .Lt256 + + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + + st1 {v20.16b-v23.16b}, [x1], #64 + tbnz x8, #63, .Lt320 + + eor v28.16b, v28.16b, v12.16b + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + + st1 {v24.16b-v27.16b}, [x1], #64 + st1 {v28.16b-v31.16b}, [x1] + +.Lout: frame_pop + ret + + // fewer than 192 bytes of in/output +.Lt192: cbz x5, 1f // exactly 128 bytes? + ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + tbl v28.16b, {v4.16b-v7.16b}, v28.16b + tbl v29.16b, {v4.16b-v7.16b}, v29.16b + tbl v30.16b, {v4.16b-v7.16b}, v30.16b + tbl v31.16b, {v4.16b-v7.16b}, v31.16b + +0: eor v20.16b, v20.16b, v28.16b + eor v21.16b, v21.16b, v29.16b + eor v22.16b, v22.16b, v30.16b + eor v23.16b, v23.16b, v31.16b + st1 {v20.16b-v23.16b}, [x5] // overlapping stores +1: st1 {v16.16b-v19.16b}, [x1] + b .Lout + + // fewer than 128 bytes of in/output +.Lt128: ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + sub x1, x1, #64 + tbl v28.16b, {v0.16b-v3.16b}, v28.16b + tbl v29.16b, {v0.16b-v3.16b}, v29.16b + tbl v30.16b, {v0.16b-v3.16b}, v30.16b + tbl v31.16b, {v0.16b-v3.16b}, v31.16b + ld1 {v16.16b-v19.16b}, [x1] // reload first output block + b 0b + + // fewer than 256 bytes of in/output +.Lt256: cbz x6, 2f // exactly 192 bytes? + ld1 {v4.16b-v7.16b}, [x10] + add x6, x6, x1 + tbl v0.16b, {v8.16b-v11.16b}, v4.16b + tbl v1.16b, {v8.16b-v11.16b}, v5.16b + tbl v2.16b, {v8.16b-v11.16b}, v6.16b + tbl v3.16b, {v8.16b-v11.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x6] // overlapping stores +2: st1 {v20.16b-v23.16b}, [x1] + b .Lout + + // fewer than 320 bytes of in/output +.Lt320: cbz x7, 3f // exactly 256 bytes? + ld1 {v4.16b-v7.16b}, [x10] + add x7, x7, x1 + tbl v0.16b, {v12.16b-v15.16b}, v4.16b + tbl v1.16b, {v12.16b-v15.16b}, v5.16b + tbl v2.16b, {v12.16b-v15.16b}, v6.16b + tbl v3.16b, {v12.16b-v15.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x7] // overlapping stores +3: st1 {v24.16b-v27.16b}, [x1] + b .Lout +SYM_FUNC_END(chacha_4block_xor_neon) + + .section ".rodata", "a", %progbits + .align L1_CACHE_SHIFT +.Lpermute: + .set .Li, 0 + .rept 128 + .byte (.Li - 64) + .set .Li, .Li + 1 + .endr + +CTRINC: .word 1, 2, 3, 4 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c new file mode 100644 index 000000000..af2bbca38 --- /dev/null +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -0,0 +1,243 @@ +/* + * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/chacha.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src, + int nrounds, int bytes); +asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, + int bytes, int nrounds) +{ + while (bytes > 0) { + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { + hchacha_block_generic(state, stream, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, stream, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_neon_stream_xor(struct skcipher_request *req, + const struct chacha_ctx *ctx, const u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + chacha_init_generic(state, ctx->key, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = rounddown(nbytes, walk.stride); + + if (!static_branch_likely(&have_neon) || + !crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { + kernel_neon_begin(); + chacha_doneon(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, ctx->nrounds); + kernel_neon_end(); + } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + return chacha_neon_stream_xor(req, ctx, req->iv); +} + +static int xchacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + chacha_init_generic(state, ctx->key, req->iv); + hchacha_block_arch(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_neon_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha_neon, + .decrypt = chacha_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, + .setkey = chacha12_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + } +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!cpu_have_named_feature(ASIMD)) + return 0; + + static_branch_enable(&have_neon); + + return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? + crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; +} + +static void __exit chacha_simd_mod_fini(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && cpu_have_named_feature(ASIMD)) + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-neon"); diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S new file mode 100644 index 000000000..dce6dcebf --- /dev/null +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -0,0 +1,515 @@ +// +// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> +// Copyright (C) 2019 Google LLC <ebiggers@google.com> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + init_crc .req w0 + buf .req x1 + len .req x2 + fold_consts_ptr .req x3 + + fold_consts .req v10 + + ad .req v14 + + k00_16 .req v15 + k32_48 .req v16 + + t3 .req v17 + t4 .req v18 + t5 .req v19 + t6 .req v20 + t7 .req v21 + t8 .req v22 + t9 .req v23 + + perm1 .req v24 + perm2 .req v25 + perm3 .req v26 + perm4 .req v27 + + bd1 .req v28 + bd2 .req v29 + bd3 .req v30 + bd4 .req v31 + + .macro __pmull_init_p64 + .endm + + .macro __pmull_pre_p64, bd + .endm + + .macro __pmull_init_p8 + // k00_16 := 0x0000000000000000_000000000000ffff + // k32_48 := 0x00000000ffffffff_0000ffffffffffff + movi k32_48.2d, #0xffffffff + mov k32_48.h[2], k32_48.h[0] + ushr k00_16.2d, k32_48.2d, #32 + + // prepare the permutation vectors + mov_q x5, 0x080f0e0d0c0b0a09 + movi perm4.8b, #8 + dup perm1.2d, x5 + eor perm1.16b, perm1.16b, perm4.16b + ushr perm2.2d, perm1.2d, #8 + ushr perm3.2d, perm1.2d, #16 + ushr perm4.2d, perm1.2d, #24 + sli perm2.2d, perm1.2d, #56 + sli perm3.2d, perm1.2d, #48 + sli perm4.2d, perm1.2d, #40 + .endm + + .macro __pmull_pre_p8, bd + tbl bd1.16b, {\bd\().16b}, perm1.16b + tbl bd2.16b, {\bd\().16b}, perm2.16b + tbl bd3.16b, {\bd\().16b}, perm3.16b + tbl bd4.16b, {\bd\().16b}, perm4.16b + .endm + +SYM_FUNC_START_LOCAL(__pmull_p8_core) +.L__pmull_p8_core: + ext t4.8b, ad.8b, ad.8b, #1 // A1 + ext t5.8b, ad.8b, ad.8b, #2 // A2 + ext t6.8b, ad.8b, ad.8b, #3 // A3 + + pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B + pmull t8.8h, ad.8b, bd1.8b // E = A*B1 + pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B + pmull t7.8h, ad.8b, bd2.8b // G = A*B2 + pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B + pmull t9.8h, ad.8b, bd3.8b // I = A*B3 + pmull t3.8h, ad.8b, bd4.8b // K = A*B4 + b 0f + +.L__pmull_p8_core2: + tbl t4.16b, {ad.16b}, perm1.16b // A1 + tbl t5.16b, {ad.16b}, perm2.16b // A2 + tbl t6.16b, {ad.16b}, perm3.16b // A3 + + pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B + pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1 + pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B + pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2 + pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B + pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3 + pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4 + +0: eor t4.16b, t4.16b, t8.16b // L = E + F + eor t5.16b, t5.16b, t7.16b // M = G + H + eor t6.16b, t6.16b, t9.16b // N = I + J + + uzp1 t8.2d, t4.2d, t5.2d + uzp2 t4.2d, t4.2d, t5.2d + uzp1 t7.2d, t6.2d, t3.2d + uzp2 t6.2d, t6.2d, t3.2d + + // t4 = (L) (P0 + P1) << 8 + // t5 = (M) (P2 + P3) << 16 + eor t8.16b, t8.16b, t4.16b + and t4.16b, t4.16b, k32_48.16b + + // t6 = (N) (P4 + P5) << 24 + // t7 = (K) (P6 + P7) << 32 + eor t7.16b, t7.16b, t6.16b + and t6.16b, t6.16b, k00_16.16b + + eor t8.16b, t8.16b, t4.16b + eor t7.16b, t7.16b, t6.16b + + zip2 t5.2d, t8.2d, t4.2d + zip1 t4.2d, t8.2d, t4.2d + zip2 t3.2d, t7.2d, t6.2d + zip1 t6.2d, t7.2d, t6.2d + + ext t4.16b, t4.16b, t4.16b, #15 + ext t5.16b, t5.16b, t5.16b, #14 + ext t6.16b, t6.16b, t6.16b, #13 + ext t3.16b, t3.16b, t3.16b, #12 + + eor t4.16b, t4.16b, t5.16b + eor t6.16b, t6.16b, t3.16b + ret +SYM_FUNC_END(__pmull_p8_core) + + .macro __pmull_p8, rq, ad, bd, i + .ifnc \bd, fold_consts + .err + .endif + mov ad.16b, \ad\().16b + .ifb \i + pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B + .else + pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B + .endif + + bl .L__pmull_p8_core\i + + eor \rq\().16b, \rq\().16b, t4.16b + eor \rq\().16b, \rq\().16b, t6.16b + .endm + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. + .macro fold_32_bytes, p, reg1, reg2 + ldp q11, q12, [buf], #0x20 + + __pmull_\p v8, \reg1, fold_consts, 2 + __pmull_\p \reg1, \reg1, fold_consts + +CPU_LE( rev64 v11.16b, v11.16b ) +CPU_LE( rev64 v12.16b, v12.16b ) + + __pmull_\p v9, \reg2, fold_consts, 2 + __pmull_\p \reg2, \reg2, fold_consts + +CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) +CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + + eor \reg1\().16b, \reg1\().16b, v8.16b + eor \reg2\().16b, \reg2\().16b, v9.16b + eor \reg1\().16b, \reg1\().16b, v11.16b + eor \reg2\().16b, \reg2\().16b, v12.16b + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts + __pmull_\p v8, \src_reg, fold_consts + __pmull_\p \src_reg, \src_reg, fold_consts, 2 + .ifnb \load_next_consts + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + .endif + eor \dst_reg\().16b, \dst_reg\().16b, v8.16b + eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b + .endm + + .macro __pmull_p64, rd, rn, rm, n + .ifb \n + pmull \rd\().1q, \rn\().1d, \rm\().1d + .else + pmull2 \rd\().1q, \rn\().2d, \rm\().2d + .endif + .endm + + .macro crc_t10dif_pmull, p + __pmull_init_\p + + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp len, #256 + b.lt .Lless_than_256_bytes_\@ + + adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + ldp q0, q1, [buf] + ldp q2, q3, [buf, #0x20] + ldp q4, q5, [buf, #0x40] + ldp q6, q7, [buf, #0x60] + add buf, buf, #0x80 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( rev64 v1.16b, v1.16b ) +CPU_LE( rev64 v2.16b, v2.16b ) +CPU_LE( rev64 v3.16b, v3.16b ) +CPU_LE( rev64 v4.16b, v4.16b ) +CPU_LE( rev64 v5.16b, v5.16b ) +CPU_LE( rev64 v6.16b, v6.16b ) +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) +CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) +CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) +CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) +CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) +CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) +CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v8.16b, #0 + mov v8.h[7], init_crc + eor v0.16b, v0.16b, v8.16b + + // Load the constants for folding across 128 bytes. + ld1 {fold_consts.2d}, [fold_consts_ptr] + __pmull_pre_\p fold_consts + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting v0-v7), fold the 128 + // bytes v0-v7 into them, storing the result back into v0-v7. +.Lfold_128_bytes_loop_\@: + fold_32_bytes \p, v0, v1 + fold_32_bytes \p, v2, v3 + fold_32_bytes \p, v4, v5 + fold_32_bytes \p, v6, v7 + + subs len, len, #128 + b.ge .Lfold_128_bytes_loop_\@ + + // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. + + // Fold across 64 bytes. + add fold_consts_ptr, fold_consts_ptr, #16 + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + fold_16_bytes \p, v0, v4 + fold_16_bytes \p, v1, v5 + fold_16_bytes \p, v2, v6 + fold_16_bytes \p, v3, v7, 1 + // Fold across 32 bytes. + fold_16_bytes \p, v4, v6 + fold_16_bytes \p, v5, v7, 1 + // Fold across 16 bytes. + fold_16_bytes \p, v6, v7 + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting v7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 + // into them, storing the result back into v7. + b.lt .Lfold_16_bytes_loop_done_\@ +.Lfold_16_bytes_loop_\@: + __pmull_\p v8, v7, fold_consts + __pmull_\p v7, v7, fold_consts, 2 + eor v7.16b, v7.16b, v8.16b + ldr q0, [buf], #16 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + eor v7.16b, v7.16b, v0.16b + subs len, len, #16 + b.ge .Lfold_16_bytes_loop_\@ + +.Lfold_16_bytes_loop_done_\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting v7), following the previous extra subtraction by 16. + adds len, len, #16 + b.eq .Lreduce_final_16_bytes_\@ + +.Lhandle_partial_segment_\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. + + // v0 = last 16 original data bytes + add buf, buf, len + ldr q0, [buf, #-16] +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + + // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. + adr_l x4, .Lbyteshift_table + 16 + sub x4, x4, len + ld1 {v2.16b}, [x4] + tbl v1.16b, {v7.16b}, v2.16b + + // v3 = first chunk: v7 right-shifted by '16-len' bytes. + movi v3.16b, #0x80 + eor v2.16b, v2.16b, v3.16b + tbl v3.16b, {v7.16b}, v2.16b + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. + sshr v2.16b, v2.16b, #7 + + // v2 = second chunk: 'len' bytes from v0 (low-order bytes), + // then '16-len' bytes from v1 (high-order bytes). + bsl v2.16b, v1.16b, v0.16b + + // Fold the first chunk into the second chunk, storing the result in v7. + __pmull_\p v0, v3, fold_consts + __pmull_\p v7, v3, fold_consts, 2 + eor v7.16b, v7.16b, v0.16b + eor v7.16b, v7.16b, v2.16b + +.Lreduce_final_16_bytes_\@: + // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. + + movi v2.16b, #0 // init zero register + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + ext v0.16b, v2.16b, v7.16b, #8 + __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x)) + eor v0.16b, v0.16b, v7.16b // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits + mov v0.s[3], v2.s[0] // zero high 32 bits + __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x)) + eor v0.16b, v0.16b, v1.16b // + low bits + + // Load G(x) and floor(x^48 / G(x)). + ld1 {fold_consts.2d}, [fold_consts_ptr] + __pmull_pre_\p fold_consts + + // Use Barrett reduction to compute the final CRC value. + __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x)) + ushr v1.2d, v1.2d, #32 // /= x^32 + __pmull_\p v1, v1, fold_consts // *= G(x) + ushr v0.2d, v0.2d, #48 + eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. + + umov w0, v0.h[0] + .ifc \p, p8 + ldp x29, x30, [sp], #16 + .endif + ret + +.Lless_than_256_bytes_\@: + // Checksumming a buffer of length 16...255 bytes + + adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + ldr q7, [buf], #0x10 +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v0.16b, #0 + mov v0.h[7], init_crc + eor v7.16b, v7.16b, v0.16b + + // Load the fold-across-16-bytes constants. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + __pmull_pre_\p fold_consts + + cmp len, #16 + b.eq .Lreduce_final_16_bytes_\@ // len == 16 + subs len, len, #32 + b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 + add len, len, #16 + b .Lhandle_partial_segment_\@ // 17 <= len <= 31 + .endm + +// +// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p8) + stp x29, x30, [sp, #-16]! + mov x29, sp + crc_t10dif_pmull p8 +SYM_FUNC_END(crc_t10dif_pmull_p8) + + .align 5 +// +// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p64) + crc_t10dif_pmull p64 +SYM_FUNC_END(crc_t10dif_pmull_p64) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. +.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c new file mode 100644 index 000000000..09eb1456a --- /dev/null +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/cpufeature.h> +#include <linux/crc-t10dif.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> + +#include <asm/neon.h> +#include <asm/simd.h> + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); +asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); + +static int crct10dif_init(struct shash_desc *desc) +{ + u16 *crc = shash_desc_ctx(desc); + + *crc = 0; + return 0; +} + +static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u16 *crc = shash_desc_ctx(desc); + + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { + do { + unsigned int chunk = length; + + if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) + chunk = SZ_4K; + + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p8(*crc, data, chunk); + kernel_neon_end(); + data += chunk; + length -= chunk; + } while (length); + } else { + *crc = crc_t10dif_generic(*crc, data, length); + } + + return 0; +} + +static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u16 *crc = shash_desc_ctx(desc); + + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { + do { + unsigned int chunk = length; + + if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) + chunk = SZ_4K; + + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p64(*crc, data, chunk); + kernel_neon_end(); + data += chunk; + length -= chunk; + } while (length); + } else { + *crc = crc_t10dif_generic(*crc, data, length); + } + + return 0; +} + +static int crct10dif_final(struct shash_desc *desc, u8 *out) +{ + u16 *crc = shash_desc_ctx(desc); + + *(u16 *)out = *crc; + return 0; +} + +static struct shash_alg crc_t10dif_alg[] = {{ + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = crct10dif_init, + .update = crct10dif_update_pmull_p8, + .final = crct10dif_final, + .descsize = CRC_T10DIF_DIGEST_SIZE, + + .base.cra_name = "crct10dif", + .base.cra_driver_name = "crct10dif-arm64-neon", + .base.cra_priority = 100, + .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = crct10dif_init, + .update = crct10dif_update_pmull_p64, + .final = crct10dif_final, + .descsize = CRC_T10DIF_DIGEST_SIZE, + + .base.cra_name = "crct10dif", + .base.cra_driver_name = "crct10dif-arm64-ce", + .base.cra_priority = 200, + .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}}; + +static int __init crc_t10dif_mod_init(void) +{ + if (cpu_have_named_feature(PMULL)) + return crypto_register_shashes(crc_t10dif_alg, + ARRAY_SIZE(crc_t10dif_alg)); + else + /* only register the first array element */ + return crypto_register_shash(crc_t10dif_alg); +} + +static void __exit crc_t10dif_mod_exit(void) +{ + if (cpu_have_named_feature(PMULL)) + crypto_unregister_shashes(crc_t10dif_alg, + ARRAY_SIZE(crc_t10dif_alg)); + else + crypto_unregister_shash(crc_t10dif_alg); +} + +module_cpu_feature_match(ASIMD, crc_t10dif_mod_init); +module_exit(crc_t10dif_mod_exit); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("crct10dif"); +MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce"); diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S new file mode 100644 index 000000000..ebe555892 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -0,0 +1,778 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Accelerated GHASH implementation with ARMv8 PMULL instructions. + * + * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <asm/assembler.h> + + SHASH .req v0 + SHASH2 .req v1 + T1 .req v2 + T2 .req v3 + MASK .req v4 + XM .req v5 + XL .req v6 + XH .req v7 + IN1 .req v7 + + k00_16 .req v8 + k32_48 .req v9 + + t3 .req v10 + t4 .req v11 + t5 .req v12 + t6 .req v13 + t7 .req v14 + t8 .req v15 + t9 .req v16 + + perm1 .req v17 + perm2 .req v18 + perm3 .req v19 + + sh1 .req v20 + sh2 .req v21 + sh3 .req v22 + sh4 .req v23 + + ss1 .req v24 + ss2 .req v25 + ss3 .req v26 + ss4 .req v27 + + XL2 .req v8 + XM2 .req v9 + XH2 .req v10 + XL3 .req v11 + XM3 .req v12 + XH3 .req v13 + TT3 .req v14 + TT4 .req v15 + HH .req v16 + HH3 .req v17 + HH4 .req v18 + HH34 .req v19 + + .text + .arch armv8-a+crypto + + .macro __pmull_p64, rd, rn, rm + pmull \rd\().1q, \rn\().1d, \rm\().1d + .endm + + .macro __pmull2_p64, rd, rn, rm + pmull2 \rd\().1q, \rn\().2d, \rm\().2d + .endm + + .macro __pmull_p8, rq, ad, bd + ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1 + ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2 + ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3 + + __pmull_p8_\bd \rq, \ad + .endm + + .macro __pmull2_p8, rq, ad, bd + tbl t3.16b, {\ad\().16b}, perm1.16b // A1 + tbl t5.16b, {\ad\().16b}, perm2.16b // A2 + tbl t7.16b, {\ad\().16b}, perm3.16b // A3 + + __pmull2_p8_\bd \rq, \ad + .endm + + .macro __pmull_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_SHASH2, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4 + .endm + + .macro __pmull2_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4 + pmull\t t3.8h, t3.\nb, \bd // F = A1*B + pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1 + pmull\t t5.8h, t5.\nb, \bd // H = A2*B + pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2 + pmull\t t7.8h, t7.\nb, \bd // J = A3*B + pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3 + pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4 + pmull\t \rq\().8h, \ad, \bd // D = A*B + + eor t3.16b, t3.16b, t4.16b // L = E + F + eor t5.16b, t5.16b, t6.16b // M = G + H + eor t7.16b, t7.16b, t8.16b // N = I + J + + uzp1 t4.2d, t3.2d, t5.2d + uzp2 t3.2d, t3.2d, t5.2d + uzp1 t6.2d, t7.2d, t9.2d + uzp2 t7.2d, t7.2d, t9.2d + + // t3 = (L) (P0 + P1) << 8 + // t5 = (M) (P2 + P3) << 16 + eor t4.16b, t4.16b, t3.16b + and t3.16b, t3.16b, k32_48.16b + + // t7 = (N) (P4 + P5) << 24 + // t9 = (K) (P6 + P7) << 32 + eor t6.16b, t6.16b, t7.16b + and t7.16b, t7.16b, k00_16.16b + + eor t4.16b, t4.16b, t3.16b + eor t6.16b, t6.16b, t7.16b + + zip2 t5.2d, t4.2d, t3.2d + zip1 t3.2d, t4.2d, t3.2d + zip2 t9.2d, t6.2d, t7.2d + zip1 t7.2d, t6.2d, t7.2d + + ext t3.16b, t3.16b, t3.16b, #15 + ext t5.16b, t5.16b, t5.16b, #14 + ext t7.16b, t7.16b, t7.16b, #13 + ext t9.16b, t9.16b, t9.16b, #12 + + eor t3.16b, t3.16b, t5.16b + eor t7.16b, t7.16b, t9.16b + eor \rq\().16b, \rq\().16b, t3.16b + eor \rq\().16b, \rq\().16b, t7.16b + .endm + + .macro __pmull_pre_p64 + add x8, x3, #16 + ld1 {HH.2d-HH4.2d}, [x8] + + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d + eor SHASH2.16b, SHASH2.16b, T1.16b + + trn1 HH34.2d, HH3.2d, HH4.2d + trn2 T1.2d, HH3.2d, HH4.2d + eor HH34.16b, HH34.16b, T1.16b + + movi MASK.16b, #0xe1 + shl MASK.2d, MASK.2d, #57 + .endm + + .macro __pmull_pre_p8 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + + // k00_16 := 0x0000000000000000_000000000000ffff + // k32_48 := 0x00000000ffffffff_0000ffffffffffff + movi k32_48.2d, #0xffffffff + mov k32_48.h[2], k32_48.h[0] + ushr k00_16.2d, k32_48.2d, #32 + + // prepare the permutation vectors + mov_q x5, 0x080f0e0d0c0b0a09 + movi T1.8b, #8 + dup perm1.2d, x5 + eor perm1.16b, perm1.16b, T1.16b + ushr perm2.2d, perm1.2d, #8 + ushr perm3.2d, perm1.2d, #16 + ushr T1.2d, perm1.2d, #24 + sli perm2.2d, perm1.2d, #56 + sli perm3.2d, perm1.2d, #48 + sli T1.2d, perm1.2d, #40 + + // precompute loop invariants + tbl sh1.16b, {SHASH.16b}, perm1.16b + tbl sh2.16b, {SHASH.16b}, perm2.16b + tbl sh3.16b, {SHASH.16b}, perm3.16b + tbl sh4.16b, {SHASH.16b}, T1.16b + ext ss1.8b, SHASH2.8b, SHASH2.8b, #1 + ext ss2.8b, SHASH2.8b, SHASH2.8b, #2 + ext ss3.8b, SHASH2.8b, SHASH2.8b, #3 + ext ss4.8b, SHASH2.8b, SHASH2.8b, #4 + .endm + + // + // PMULL (64x64->128) based reduction for CPUs that can do + // it in a single instruction. + // + .macro __pmull_reduce_p64 + pmull T2.1q, XL.1d, MASK.1d + eor XM.16b, XM.16b, T1.16b + + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + eor XL.16b, XM.16b, T2.16b + ext T2.16b, XL.16b, XL.16b, #8 + pmull XL.1q, XL.1d, MASK.1d + .endm + + // + // Alternative reduction for CPUs that lack support for the + // 64x64->128 PMULL instruction + // + .macro __pmull_reduce_p8 + eor XM.16b, XM.16b, T1.16b + + mov XL.d[1], XM.d[0] + mov XH.d[0], XM.d[1] + + shl T1.2d, XL.2d, #57 + shl T2.2d, XL.2d, #62 + eor T2.16b, T2.16b, T1.16b + shl T1.2d, XL.2d, #63 + eor T2.16b, T2.16b, T1.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, T2.16b, T1.16b + + mov XL.d[1], T2.d[0] + mov XH.d[0], T2.d[1] + + ushr T2.2d, XL.2d, #1 + eor XH.16b, XH.16b, XL.16b + eor XL.16b, XL.16b, T2.16b + ushr T2.2d, T2.2d, #6 + ushr XL.2d, XL.2d, #1 + .endm + + .macro __pmull_ghash, pn + ld1 {SHASH.2d}, [x3] + ld1 {XL.2d}, [x1] + + __pmull_pre_\pn + + /* do the head block first, if supplied */ + cbz x4, 0f + ld1 {T1.2d}, [x4] + mov x4, xzr + b 3f + +0: .ifc \pn, p64 + tbnz w0, #0, 2f // skip until #blocks is a + tbnz w0, #1, 2f // round multiple of 4 + +1: ld1 {XM3.16b-TT4.16b}, [x2], #64 + + sub w0, w0, #4 + + rev64 T1.16b, XM3.16b + rev64 T2.16b, XH3.16b + rev64 TT4.16b, TT4.16b + rev64 TT3.16b, TT3.16b + + ext IN1.16b, TT4.16b, TT4.16b, #8 + ext XL3.16b, TT3.16b, TT3.16b, #8 + + eor TT4.16b, TT4.16b, IN1.16b + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) + + eor TT3.16b, TT3.16b, XL3.16b + pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1 + pmull XL3.1q, HH.1d, XL3.1d // a0 * b0 + pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) + + ext IN1.16b, T2.16b, T2.16b, #8 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + eor T2.16b, T2.16b, IN1.16b + pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1 + pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0 + pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) + + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + ext IN1.16b, T1.16b, T1.16b, #8 + ext TT3.16b, XL.16b, XL.16b, #8 + eor XL.16b, XL.16b, IN1.16b + eor T1.16b, T1.16b, TT3.16b + + pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + pmull XL.1q, HH4.1d, XL.1d // a0 * b0 + pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) + + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b + eor XM.16b, XM.16b, XM2.16b + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_p64 + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbz w0, 5f + b 1b + .endif + +2: ld1 {T1.2d}, [x2], #16 + sub w0, w0, #1 + +3: /* multiply XL by SHASH in GF(2^128) */ +CPU_LE( rev64 T1.16b, T1.16b ) + + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b + + __pmull2_\pn XH, XL, SHASH // a1 * b1 + eor T1.16b, T1.16b, XL.16b + __pmull_\pn XL, XL, SHASH // a0 * b0 + __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) + +4: eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_\pn + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbnz w0, 0b + +5: st1 {XL.2d}, [x1] + ret + .endm + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +SYM_TYPED_FUNC_START(pmull_ghash_update_p64) + __pmull_ghash p64 +SYM_FUNC_END(pmull_ghash_update_p64) + +SYM_TYPED_FUNC_START(pmull_ghash_update_p8) + __pmull_ghash p8 +SYM_FUNC_END(pmull_ghash_update_p8) + + KS0 .req v8 + KS1 .req v9 + KS2 .req v10 + KS3 .req v11 + + INP0 .req v21 + INP1 .req v22 + INP2 .req v23 + INP3 .req v24 + + K0 .req v25 + K1 .req v26 + K2 .req v27 + K3 .req v28 + K4 .req v12 + K5 .req v13 + K6 .req v4 + K7 .req v5 + K8 .req v14 + K9 .req v15 + KK .req v29 + KL .req v30 + KM .req v31 + + .macro load_round_keys, rounds, rk, tmp + add \tmp, \rk, #64 + ld1 {K0.4s-K3.4s}, [\rk] + ld1 {K4.4s-K5.4s}, [\tmp] + add \tmp, \rk, \rounds, lsl #4 + sub \tmp, \tmp, #32 + ld1 {KK.4s-KM.4s}, [\tmp] + .endm + + .macro enc_round, state, key + aese \state\().16b, \key\().16b + aesmc \state\().16b, \state\().16b + .endm + + .macro enc_qround, s0, s1, s2, s3, key + enc_round \s0, \key + enc_round \s1, \key + enc_round \s2, \key + enc_round \s3, \key + .endm + + .macro enc_block, state, rounds, rk, tmp + add \tmp, \rk, #96 + ld1 {K6.4s-K7.4s}, [\tmp], #32 + .irp key, K0, K1, K2, K3, K4 K5 + enc_round \state, \key + .endr + + tbnz \rounds, #2, .Lnot128_\@ +.Lout256_\@: + enc_round \state, K6 + enc_round \state, K7 + +.Lout192_\@: + enc_round \state, KK + aese \state\().16b, KL.16b + eor \state\().16b, \state\().16b, KM.16b + + .subsection 1 +.Lnot128_\@: + ld1 {K8.4s-K9.4s}, [\tmp], #32 + enc_round \state, K6 + enc_round \state, K7 + ld1 {K6.4s-K7.4s}, [\tmp] + enc_round \state, K8 + enc_round \state, K9 + tbz \rounds, #1, .Lout192_\@ + b .Lout256_\@ + .previous + .endm + + .align 6 + .macro pmull_gcm_do_crypt, enc + stp x29, x30, [sp, #-32]! + mov x29, sp + str x19, [sp, #24] + + load_round_keys x7, x6, x8 + + ld1 {SHASH.2d}, [x3], #16 + ld1 {HH.2d-HH4.2d}, [x3] + + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d + eor SHASH2.16b, SHASH2.16b, T1.16b + + trn1 HH34.2d, HH3.2d, HH4.2d + trn2 T1.2d, HH3.2d, HH4.2d + eor HH34.16b, HH34.16b, T1.16b + + ld1 {XL.2d}, [x4] + + cbz x0, 3f // tag only? + + ldr w8, [x5, #12] // load lower counter +CPU_LE( rev w8, w8 ) + +0: mov w9, #4 // max blocks per round + add x10, x0, #0xf + lsr x10, x10, #4 // remaining blocks + + subs x0, x0, #64 + csel w9, w10, w9, mi + add w8, w8, w9 + + bmi 1f + ld1 {INP0.16b-INP3.16b}, [x2], #64 + .subsection 1 + /* + * Populate the four input registers right to left with up to 63 bytes + * of data, using overlapping loads to avoid branches. + * + * INP0 INP1 INP2 INP3 + * 1 byte | | | |x | + * 16 bytes | | | |xxxxxxxx| + * 17 bytes | | |xxxxxxxx|x | + * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx | + * etc etc + * + * Note that this code may read up to 15 bytes before the start of + * the input. It is up to the calling code to ensure this is safe if + * this happens in the first iteration of the loop (i.e., when the + * input size is < 16 bytes) + */ +1: mov x15, #16 + ands x19, x0, #0xf + csel x19, x19, x15, ne + adr_l x17, .Lpermute_table + 16 + + sub x11, x15, x19 + add x12, x17, x11 + sub x17, x17, x11 + ld1 {T1.16b}, [x12] + sub x10, x1, x11 + sub x11, x2, x11 + + cmp x0, #-16 + csel x14, x15, xzr, gt + cmp x0, #-32 + csel x15, x15, xzr, gt + cmp x0, #-48 + csel x16, x19, xzr, gt + csel x1, x1, x10, gt + csel x2, x2, x11, gt + + ld1 {INP0.16b}, [x2], x14 + ld1 {INP1.16b}, [x2], x15 + ld1 {INP2.16b}, [x2], x16 + ld1 {INP3.16b}, [x2] + tbl INP3.16b, {INP3.16b}, T1.16b + b 2f + .previous + +2: .if \enc == 0 + bl pmull_gcm_ghash_4x + .endif + + bl pmull_gcm_enc_4x + + tbnz x0, #63, 6f + st1 {INP0.16b-INP3.16b}, [x1], #64 + .if \enc == 1 + bl pmull_gcm_ghash_4x + .endif + bne 0b + +3: ldp x19, x10, [sp, #24] + cbz x10, 5f // output tag? + + ld1 {INP3.16b}, [x10] // load lengths[] + mov w9, #1 + bl pmull_gcm_ghash_4x + + mov w11, #(0x1 << 24) // BE '1U' + ld1 {KS0.16b}, [x5] + mov KS0.s[3], w11 + + enc_block KS0, x7, x6, x12 + + ext XL.16b, XL.16b, XL.16b, #8 + rev64 XL.16b, XL.16b + eor XL.16b, XL.16b, KS0.16b + + .if \enc == 1 + st1 {XL.16b}, [x10] // store tag + .else + ldp x11, x12, [sp, #40] // load tag pointer and authsize + adr_l x17, .Lpermute_table + ld1 {KS0.16b}, [x11] // load supplied tag + add x17, x17, x12 + ld1 {KS1.16b}, [x17] // load permute vector + + cmeq XL.16b, XL.16b, KS0.16b // compare tags + mvn XL.16b, XL.16b // -1 for fail, 0 for pass + tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only + sminv b0, XL.16b // signed minimum across XL + smov w0, v0.b[0] // return b0 + .endif + +4: ldp x29, x30, [sp], #32 + ret + +5: +CPU_LE( rev w8, w8 ) + str w8, [x5, #12] // store lower counter + st1 {XL.2d}, [x4] + b 4b + +6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors + sub x17, x17, x19, lsl #1 + + cmp w9, #1 + beq 7f + .subsection 1 +7: ld1 {INP2.16b}, [x1] + tbx INP2.16b, {INP3.16b}, T1.16b + mov INP3.16b, INP2.16b + b 8f + .previous + + st1 {INP0.16b}, [x1], x14 + st1 {INP1.16b}, [x1], x15 + st1 {INP2.16b}, [x1], x16 + tbl INP3.16b, {INP3.16b}, T1.16b + tbx INP3.16b, {INP2.16b}, T2.16b +8: st1 {INP3.16b}, [x1] + + .if \enc == 1 + ld1 {T1.16b}, [x17] + tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits + bl pmull_gcm_ghash_4x + .endif + b 3b + .endm + + /* + * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[], + * struct ghash_key const *k, u64 dg[], u8 ctr[], + * int rounds, u8 tag) + */ +SYM_FUNC_START(pmull_gcm_encrypt) + pmull_gcm_do_crypt 1 +SYM_FUNC_END(pmull_gcm_encrypt) + + /* + * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[], + * struct ghash_key const *k, u64 dg[], u8 ctr[], + * int rounds, u8 tag) + */ +SYM_FUNC_START(pmull_gcm_decrypt) + pmull_gcm_do_crypt 0 +SYM_FUNC_END(pmull_gcm_decrypt) + +SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x) + movi MASK.16b, #0xe1 + shl MASK.2d, MASK.2d, #57 + + rev64 T1.16b, INP0.16b + rev64 T2.16b, INP1.16b + rev64 TT3.16b, INP2.16b + rev64 TT4.16b, INP3.16b + + ext XL.16b, XL.16b, XL.16b, #8 + + tbz w9, #2, 0f // <4 blocks? + .subsection 1 +0: movi XH2.16b, #0 + movi XM2.16b, #0 + movi XL2.16b, #0 + + tbz w9, #0, 1f // 2 blocks? + tbz w9, #1, 2f // 1 block? + + eor T2.16b, T2.16b, XL.16b + ext T1.16b, T2.16b, T2.16b, #8 + b .Lgh3 + +1: eor TT3.16b, TT3.16b, XL.16b + ext T2.16b, TT3.16b, TT3.16b, #8 + b .Lgh2 + +2: eor TT4.16b, TT4.16b, XL.16b + ext IN1.16b, TT4.16b, TT4.16b, #8 + b .Lgh1 + .previous + + eor T1.16b, T1.16b, XL.16b + ext IN1.16b, T1.16b, T1.16b, #8 + + pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1 + eor T1.16b, T1.16b, IN1.16b + pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0 + pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) + + ext T1.16b, T2.16b, T2.16b, #8 +.Lgh3: eor T2.16b, T2.16b, T1.16b + pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1 + pmull XL.1q, HH3.1d, T1.1d // a0 * b0 + pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) + + eor XH2.16b, XH2.16b, XH.16b + eor XL2.16b, XL2.16b, XL.16b + eor XM2.16b, XM2.16b, XM.16b + + ext T2.16b, TT3.16b, TT3.16b, #8 +.Lgh2: eor TT3.16b, TT3.16b, T2.16b + pmull2 XH.1q, HH.2d, T2.2d // a1 * b1 + pmull XL.1q, HH.1d, T2.1d // a0 * b0 + pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) + + eor XH2.16b, XH2.16b, XH.16b + eor XL2.16b, XL2.16b, XL.16b + eor XM2.16b, XM2.16b, XM.16b + + ext IN1.16b, TT4.16b, TT4.16b, #8 +.Lgh1: eor TT4.16b, TT4.16b, IN1.16b + pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0 + pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1 + pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) + + eor XH.16b, XH.16b, XH2.16b + eor XL.16b, XL.16b, XL2.16b + eor XM.16b, XM.16b, XM2.16b + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_p64 + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + ret +SYM_FUNC_END(pmull_gcm_ghash_4x) + +SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x) + ld1 {KS0.16b}, [x5] // load upper counter + sub w10, w8, #4 + sub w11, w8, #3 + sub w12, w8, #2 + sub w13, w8, #1 + rev w10, w10 + rev w11, w11 + rev w12, w12 + rev w13, w13 + mov KS1.16b, KS0.16b + mov KS2.16b, KS0.16b + mov KS3.16b, KS0.16b + ins KS0.s[3], w10 // set lower counter + ins KS1.s[3], w11 + ins KS2.s[3], w12 + ins KS3.s[3], w13 + + add x10, x6, #96 // round key pointer + ld1 {K6.4s-K7.4s}, [x10], #32 + .irp key, K0, K1, K2, K3, K4, K5 + enc_qround KS0, KS1, KS2, KS3, \key + .endr + + tbnz x7, #2, .Lnot128 + .subsection 1 +.Lnot128: + ld1 {K8.4s-K9.4s}, [x10], #32 + .irp key, K6, K7 + enc_qround KS0, KS1, KS2, KS3, \key + .endr + ld1 {K6.4s-K7.4s}, [x10] + .irp key, K8, K9 + enc_qround KS0, KS1, KS2, KS3, \key + .endr + tbz x7, #1, .Lout192 + b .Lout256 + .previous + +.Lout256: + .irp key, K6, K7 + enc_qround KS0, KS1, KS2, KS3, \key + .endr + +.Lout192: + enc_qround KS0, KS1, KS2, KS3, KK + + aese KS0.16b, KL.16b + aese KS1.16b, KL.16b + aese KS2.16b, KL.16b + aese KS3.16b, KL.16b + + eor KS0.16b, KS0.16b, KM.16b + eor KS1.16b, KS1.16b, KM.16b + eor KS2.16b, KS2.16b, KM.16b + eor KS3.16b, KS3.16b, KM.16b + + eor INP0.16b, INP0.16b, KS0.16b + eor INP1.16b, INP1.16b, KS1.16b + eor INP2.16b, INP2.16b, KS2.16b + eor INP3.16b, INP3.16b, KS3.16b + + ret +SYM_FUNC_END(pmull_gcm_enc_4x) + + .section ".rodata", "a" + .align 6 +.Lpermute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .previous diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c new file mode 100644 index 000000000..15794fe21 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated GHASH implementation with ARMv8 PMULL instructions. + * + * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/b128ops.h> +#include <crypto/gf128mul.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ghash"); + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 +#define GCM_IV_SIZE 12 + +struct ghash_key { + be128 k; + u64 h[][2]; +}; + +struct ghash_desc_ctx { + u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; + u8 buf[GHASH_BLOCK_SIZE]; + u32 count; +}; + +struct gcm_aes_ctx { + struct crypto_aes_ctx aes_key; + struct ghash_key ghash_key; +}; + +asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + u64 const h[][2], const char *head); + +asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, + u64 const h[][2], const char *head); + +asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[], + u64 const h[][2], u64 dg[], u8 ctr[], + u32 const rk[], int rounds, u8 tag[]); +asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[], + u64 const h[][2], u64 dg[], u8 ctr[], + u32 const rk[], int rounds, const u8 l[], + const u8 tag[], u64 authsize); + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + + *ctx = (struct ghash_desc_ctx){}; + return 0; +} + +static void ghash_do_update(int blocks, u64 dg[], const char *src, + struct ghash_key *key, const char *head) +{ + be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; + + do { + const u8 *in = src; + + if (head) { + in = head; + blocks++; + head = NULL; + } else { + src += GHASH_BLOCK_SIZE; + } + + crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); + gf128mul_lle(&dst, &key->k); + } while (--blocks); + + dg[0] = be64_to_cpu(dst.b); + dg[1] = be64_to_cpu(dst.a); +} + +static __always_inline +void ghash_do_simd_update(int blocks, u64 dg[], const char *src, + struct ghash_key *key, const char *head, + void (*simd_update)(int blocks, u64 dg[], + const char *src, + u64 const h[][2], + const char *head)) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + simd_update(blocks, dg, src, key->h, head); + kernel_neon_end(); + } else { + ghash_do_update(blocks, dg, src, key, head); + } +} + +/* avoid hogging the CPU for too long */ +#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE) + +static int ghash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + + ctx->count += len; + + if ((partial + len) >= GHASH_BLOCK_SIZE) { + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + int blocks; + + if (partial) { + int p = GHASH_BLOCK_SIZE - partial; + + memcpy(ctx->buf + partial, src, p); + src += p; + len -= p; + } + + blocks = len / GHASH_BLOCK_SIZE; + len %= GHASH_BLOCK_SIZE; + + do { + int chunk = min(blocks, MAX_BLOCKS); + + ghash_do_simd_update(chunk, ctx->digest, src, key, + partial ? ctx->buf : NULL, + pmull_ghash_update_p8); + + blocks -= chunk; + src += chunk * GHASH_BLOCK_SIZE; + partial = 0; + } while (unlikely(blocks > 0)); + } + if (len) + memcpy(ctx->buf + partial, src, len); + return 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + + if (partial) { + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + + memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); + + ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL, + pmull_ghash_update_p8); + } + put_unaligned_be64(ctx->digest[1], dst); + put_unaligned_be64(ctx->digest[0], dst + 8); + + memzero_explicit(ctx, sizeof(*ctx)); + return 0; +} + +static void ghash_reflect(u64 h[], const be128 *k) +{ + u64 carry = be64_to_cpu(k->a) & BIT(63) ? 1 : 0; + + h[0] = (be64_to_cpu(k->b) << 1) | carry; + h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63); + + if (carry) + h[1] ^= 0xc200000000000000UL; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *inkey, unsigned int keylen) +{ + struct ghash_key *key = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) + return -EINVAL; + + /* needed for the fallback */ + memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); + + ghash_reflect(key->h[0], &key->k); + return 0; +} + +static struct shash_alg ghash_alg = { + .base.cra_name = "ghash", + .base.cra_driver_name = "ghash-neon", + .base.cra_priority = 150, + .base.cra_blocksize = GHASH_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]), + .base.cra_module = THIS_MODULE, + + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), +}; + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, + unsigned int keylen) +{ + struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); + u8 key[GHASH_BLOCK_SIZE]; + be128 h; + int ret; + + ret = aes_expandkey(&ctx->aes_key, inkey, keylen); + if (ret) + return -EINVAL; + + aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){}); + + /* needed for the fallback */ + memcpy(&ctx->ghash_key.k, key, GHASH_BLOCK_SIZE); + + ghash_reflect(ctx->ghash_key.h[0], &ctx->ghash_key.k); + + h = ctx->ghash_key.k; + gf128mul_lle(&h, &ctx->ghash_key.k); + ghash_reflect(ctx->ghash_key.h[1], &h); + + gf128mul_lle(&h, &ctx->ghash_key.k); + ghash_reflect(ctx->ghash_key.h[2], &h); + + gf128mul_lle(&h, &ctx->ghash_key.k); + ghash_reflect(ctx->ghash_key.h[3], &h); + + return 0; +} + +static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12 ... 16: + break; + default: + return -EINVAL; + } + return 0; +} + +static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[], + int *buf_count, struct gcm_aes_ctx *ctx) +{ + if (*buf_count > 0) { + int buf_added = min(count, GHASH_BLOCK_SIZE - *buf_count); + + memcpy(&buf[*buf_count], src, buf_added); + + *buf_count += buf_added; + src += buf_added; + count -= buf_added; + } + + if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) { + int blocks = count / GHASH_BLOCK_SIZE; + + ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key, + *buf_count ? buf : NULL, + pmull_ghash_update_p64); + + src += blocks * GHASH_BLOCK_SIZE; + count %= GHASH_BLOCK_SIZE; + *buf_count = 0; + } + + if (count > 0) { + memcpy(buf, src, count); + *buf_count = count; + } +} + +static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + u8 buf[GHASH_BLOCK_SIZE]; + struct scatter_walk walk; + u32 len = req->assoclen; + int buf_count = 0; + + scatterwalk_start(&walk, req->src); + + do { + u32 n = scatterwalk_clamp(&walk, len); + u8 *p; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, len); + } + p = scatterwalk_map(&walk); + + gcm_update_mac(dg, p, n, buf, &buf_count, ctx); + len -= n; + + scatterwalk_unmap(p); + scatterwalk_advance(&walk, n); + scatterwalk_done(&walk, 0, len); + } while (len); + + if (buf_count) { + memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count); + ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL, + pmull_ghash_update_p64); + } +} + +static int gcm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + int nrounds = num_rounds(&ctx->aes_key); + struct skcipher_walk walk; + u8 buf[AES_BLOCK_SIZE]; + u8 iv[AES_BLOCK_SIZE]; + u64 dg[2] = {}; + be128 lengths; + u8 *tag; + int err; + + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64(req->cryptlen * 8); + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, false); + + do { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + int nbytes = walk.nbytes; + + tag = (u8 *)&lengths; + + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) { + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + } else if (nbytes < walk.total) { + nbytes &= ~(AES_BLOCK_SIZE - 1); + tag = NULL; + } + + kernel_neon_begin(); + pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h, + dg, iv, ctx->aes_key.key_enc, nrounds, + tag); + kernel_neon_end(); + + if (unlikely(!nbytes)) + break; + + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } while (walk.nbytes); + + if (err) + return err; + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int gcm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + unsigned int authsize = crypto_aead_authsize(aead); + int nrounds = num_rounds(&ctx->aes_key); + struct skcipher_walk walk; + u8 otag[AES_BLOCK_SIZE]; + u8 buf[AES_BLOCK_SIZE]; + u8 iv[AES_BLOCK_SIZE]; + u64 dg[2] = {}; + be128 lengths; + u8 *tag; + int ret; + int err; + + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8); + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + scatterwalk_map_and_copy(otag, req->src, + req->assoclen + req->cryptlen - authsize, + authsize, 0); + + err = skcipher_walk_aead_decrypt(&walk, req, false); + + do { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + int nbytes = walk.nbytes; + + tag = (u8 *)&lengths; + + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) { + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + } else if (nbytes < walk.total) { + nbytes &= ~(AES_BLOCK_SIZE - 1); + tag = NULL; + } + + kernel_neon_begin(); + ret = pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h, + dg, iv, ctx->aes_key.key_enc, + nrounds, tag, otag, authsize); + kernel_neon_end(); + + if (unlikely(!nbytes)) + break; + + if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, + buf + sizeof(buf) - nbytes, nbytes); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } while (walk.nbytes); + + if (err) + return err; + + return ret ? -EBADMSG : 0; +} + +static struct aead_alg gcm_aes_alg = { + .ivsize = GCM_IV_SIZE, + .chunksize = AES_BLOCK_SIZE, + .maxauthsize = AES_BLOCK_SIZE, + .setkey = gcm_setkey, + .setauthsize = gcm_setauthsize, + .encrypt = gcm_encrypt, + .decrypt = gcm_decrypt, + + .base.cra_name = "gcm(aes)", + .base.cra_driver_name = "gcm-aes-ce", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) + + 4 * sizeof(u64[2]), + .base.cra_module = THIS_MODULE, +}; + +static int __init ghash_ce_mod_init(void) +{ + if (!cpu_have_named_feature(ASIMD)) + return -ENODEV; + + if (cpu_have_named_feature(PMULL)) + return crypto_register_aead(&gcm_aes_alg); + + return crypto_register_shash(&ghash_alg); +} + +static void __exit ghash_ce_mod_exit(void) +{ + if (cpu_have_named_feature(PMULL)) + crypto_unregister_aead(&gcm_aes_alg); + else + crypto_unregister_shash(&ghash_alg); +} + +static const struct cpu_feature ghash_cpu_feature[] = { + { cpu_feature(PMULL) }, { } +}; +MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature); + +module_init(ghash_ce_mod_init); +module_exit(ghash_ce_mod_exit); diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S new file mode 100644 index 000000000..51c0a534e --- /dev/null +++ b/arch/arm64/crypto/nh-neon-core.S @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, ARM64 NEON accelerated version + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> + + KEY .req x0 + MESSAGE .req x1 + MESSAGE_LEN .req x2 + HASH .req x3 + + PASS0_SUMS .req v0 + PASS1_SUMS .req v1 + PASS2_SUMS .req v2 + PASS3_SUMS .req v3 + K0 .req v4 + K1 .req v5 + K2 .req v6 + K3 .req v7 + T0 .req v8 + T1 .req v9 + T2 .req v10 + T3 .req v11 + T4 .req v12 + T5 .req v13 + T6 .req v14 + T7 .req v15 + +.macro _nh_stride k0, k1, k2, k3 + + // Load next message stride + ld1 {T3.16b}, [MESSAGE], #16 + + // Load next key stride + ld1 {\k3\().4s}, [KEY], #16 + + // Add message words to key words + add T0.4s, T3.4s, \k0\().4s + add T1.4s, T3.4s, \k1\().4s + add T2.4s, T3.4s, \k2\().4s + add T3.4s, T3.4s, \k3\().4s + + // Multiply 32x32 => 64 and accumulate + mov T4.d[0], T0.d[1] + mov T5.d[0], T1.d[1] + mov T6.d[0], T2.d[1] + mov T7.d[0], T3.d[1] + umlal PASS0_SUMS.2d, T0.2s, T4.2s + umlal PASS1_SUMS.2d, T1.2s, T5.2s + umlal PASS2_SUMS.2d, T2.2s, T6.2s + umlal PASS3_SUMS.2d, T3.2s, T7.2s +.endm + +/* + * void nh_neon(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. + */ +SYM_FUNC_START(nh_neon) + + ld1 {K0.4s,K1.4s}, [KEY], #32 + movi PASS0_SUMS.2d, #0 + movi PASS1_SUMS.2d, #0 + ld1 {K2.4s}, [KEY], #16 + movi PASS2_SUMS.2d, #0 + movi PASS3_SUMS.2d, #0 + + subs MESSAGE_LEN, MESSAGE_LEN, #64 + blt .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3 + _nh_stride K1, K2, K3, K0 + _nh_stride K2, K3, K0, K1 + _nh_stride K3, K0, K1, K2 + subs MESSAGE_LEN, MESSAGE_LEN, #64 + bge .Lloop4 + +.Lloop4_done: + ands MESSAGE_LEN, MESSAGE_LEN, #63 + beq .Ldone + _nh_stride K0, K1, K2, K3 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K1, K2, K3, K0 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K2, K3, K0, K1 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d + addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d + st1 {T0.16b,T1.16b}, [HASH] + ret +SYM_FUNC_END(nh_neon) diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c new file mode 100644 index 000000000..c5405e6a6 --- /dev/null +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (ARM64 NEON accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/nhpoly1305.h> +#include <linux/module.h> + +asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_neon(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_neon_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !crypto_simd_usable()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, SZ_4K); + + kernel_neon_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); + kernel_neon_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-neon", + .base.cra_priority = 200, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_neon_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!cpu_have_named_feature(ASIMD)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-neon"); diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl new file mode 100644 index 000000000..cbc980fb0 --- /dev/null +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -0,0 +1,913 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# ThunderX2 1.17/+95% 1.36 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; +$output=shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif + +.text + +// forward "declarations" are required for Apple +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] +#endif + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 +#ifdef __AARCH64EB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + mov w#$s1,#-1 + stp $r0,$r1,[$ctx,#32] // save key value + str w#$s1,[$ctx,#48] // impossible key power value + +#ifndef __KERNEL__ + tst w17,#ARMV7_NEON + + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + + csel $d0,$d0,$r0,eq + +# ifdef __ILP32__ + stp w#$d0,w#$d1,[$len] +# else + stp $d0,$d1,[$len] +# endif +#endif + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] + ldp $r0,$r1,[$ctx,#32] // load key value + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp x17,#0 // is_base2_26? + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __AARCH64EB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] + ldp $t0,$t1,[$nonce] // load nonce + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp $r0,#0 // is_base2_26? + csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __AARCH64EB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __AARCH64EB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 4 +poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +#ifdef __KERNEL__ +.globl poly1305_blocks_neon +#endif +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.lo .Lpoly1305_blocks + + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + b .Leven_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + +.Linit_neon: + ldr w17,[$ctx,#48] // first table element + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cmp w17,#-1 // is value impossible? + b.ne .Leven_neon + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + sub $ctx,$ctx,#4 + bl poly1305_splat + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$inp,#32] // inp[2:3] + subs $len,$len,#64 + ldp x9,x13,[$inp,#48] + add $in2,$inp,#96 + adr $zeros,.Lzeros + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 {$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal $ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal $ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 $ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 $ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + ldr x30,[sp,#8] + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + mov x4,#1 + st1 {$ACC4}[0],[$ctx] + str x4,[$ctx,#8] // set is_base2_26 + + ldr x29,[sp],#80 + .inst 0xd50323bf // autiasp + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.align 2 +#if !defined(__KERNEL__) && !defined(_WIN64) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +___ + +foreach (split("\n",$code)) { + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + s/w#x([0-9]+)/w$1/g; + + print $_,"\n"; +} +close STDOUT; diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c new file mode 100644 index 000000000..1fae18ba1 --- /dev/null +++ b/arch/arm64/crypto/poly1305-glue.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/jump_label.h> +#include <linux/module.h> + +asmlinkage void poly1305_init_arm64(void *state, const u8 *key); +asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) +{ + poly1305_init_arm64(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(key + 16); + dctx->s[1] = get_unaligned_le32(key + 20); + dctx->s[2] = get_unaligned_le32(key + 24); + dctx->s[3] = get_unaligned_le32(key + 28); + dctx->buflen = 0; +} +EXPORT_SYMBOL(poly1305_init_arch); + +static int neon_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, + u32 len, u32 hibit, bool do_neon) +{ + if (unlikely(!dctx->sset)) { + if (!dctx->rset) { + poly1305_init_arm64(&dctx->h, src); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); + src += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + if (len < POLY1305_BLOCK_SIZE) + return; + } + + len &= ~(POLY1305_BLOCK_SIZE - 1); + + if (static_branch_likely(&have_neon) && likely(do_neon)) + poly1305_blocks_neon(&dctx->h, src, len, hibit); + else + poly1305_blocks(&dctx->h, src, len, hibit); +} + +static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx, + const u8 *src, u32 len, bool do_neon) +{ + if (unlikely(dctx->buflen)) { + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + len -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + neon_poly1305_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE, 1, false); + dctx->buflen = 0; + } + } + + if (likely(len >= POLY1305_BLOCK_SIZE)) { + neon_poly1305_blocks(dctx, src, len, 1, do_neon); + src += round_down(len, POLY1305_BLOCK_SIZE); + len %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(len)) { + dctx->buflen = len; + memcpy(dctx->buf, src, len); + } +} + +static int neon_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + bool do_neon = crypto_simd_usable() && srclen > 128; + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (static_branch_likely(&have_neon) && do_neon) + kernel_neon_begin(); + neon_poly1305_do_update(dctx, src, srclen, do_neon); + if (static_branch_likely(&have_neon) && do_neon) + kernel_neon_end(); + return 0; +} + +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int nbytes) +{ + if (unlikely(dctx->buflen)) { + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); + + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + nbytes -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); + dctx->buflen = 0; + } + } + + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); + + if (static_branch_likely(&have_neon) && crypto_simd_usable()) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(&dctx->h, src, todo, 1); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else { + poly1305_blocks(&dctx->h, src, len, 1); + src += len; + } + nbytes %= POLY1305_BLOCK_SIZE; + } + + if (unlikely(nbytes)) { + dctx->buflen = nbytes; + memcpy(dctx->buf, src, nbytes); + } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) +{ + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); + } + + poly1305_emit(&dctx->h, dst, dctx->s); + memzero_explicit(dctx, sizeof(*dctx)); +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int neon_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_arch(dctx, dst); + return 0; +} + +static struct shash_alg neon_poly1305_alg = { + .init = neon_poly1305_init, + .update = neon_poly1305_update, + .final = neon_poly1305_final, + .digestsize = POLY1305_DIGEST_SIZE, + .descsize = sizeof(struct poly1305_desc_ctx), + + .base.cra_name = "poly1305", + .base.cra_driver_name = "poly1305-neon", + .base.cra_priority = 200, + .base.cra_blocksize = POLY1305_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}; + +static int __init neon_poly1305_mod_init(void) +{ + if (!cpu_have_named_feature(ASIMD)) + return 0; + + static_branch_enable(&have_neon); + + return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? + crypto_register_shash(&neon_poly1305_alg) : 0; +} + +static void __exit neon_poly1305_mod_exit(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD)) + crypto_unregister_shash(&neon_poly1305_alg); +} + +module_init(neon_poly1305_mod_init); +module_exit(neon_poly1305_mod_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S new file mode 100644 index 000000000..b5326540d --- /dev/null +++ b/arch/arm64/crypto/polyval-ce-core.S @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Implementation of POLYVAL using ARMv8 Crypto Extensions. + * + * Copyright 2021 Google LLC + */ +/* + * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions + * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8, + * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split + * finite field multiplication into two steps. + * + * In the first step, we consider h^i, m_i as normal polynomials of degree less + * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication + * is simply polynomial multiplication. + * + * In the second step, we compute the reduction of p(x) modulo the finite field + * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. + * + * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where + * multiplication is finite field multiplication. The advantage is that the + * two-step process only requires 1 finite field reduction for every 8 + * polynomial multiplications. Further parallelism is gained by interleaving the + * multiplications and polynomial reductions. + */ + +#include <linux/linkage.h> +#define STRIDE_BLOCKS 8 + +KEY_POWERS .req x0 +MSG .req x1 +BLOCKS_LEFT .req x2 +ACCUMULATOR .req x3 +KEY_START .req x10 +EXTRA_BYTES .req x11 +TMP .req x13 + +M0 .req v0 +M1 .req v1 +M2 .req v2 +M3 .req v3 +M4 .req v4 +M5 .req v5 +M6 .req v6 +M7 .req v7 +KEY8 .req v8 +KEY7 .req v9 +KEY6 .req v10 +KEY5 .req v11 +KEY4 .req v12 +KEY3 .req v13 +KEY2 .req v14 +KEY1 .req v15 +PL .req v16 +PH .req v17 +TMP_V .req v18 +LO .req v20 +MI .req v21 +HI .req v22 +SUM .req v23 +GSTAR .req v24 + + .text + + .arch armv8-a+crypto + .align 4 + +.Lgstar: + .quad 0xc200000000000000, 0xc200000000000000 + +/* + * Computes the product of two 128-bit polynomials in X and Y and XORs the + * components of the 256-bit product into LO, MI, HI. + * + * Given: + * X = [X_1 : X_0] + * Y = [Y_1 : Y_0] + * + * We compute: + * LO += X_0 * Y_0 + * MI += (X_0 + X_1) * (Y_0 + Y_1) + * HI += X_1 * Y_1 + * + * Later, the 256-bit result can be extracted as: + * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0] + * This step is done when computing the polynomial reduction for efficiency + * reasons. + * + * Karatsuba multiplication is used instead of Schoolbook multiplication because + * it was found to be slightly faster on ARM64 CPUs. + * + */ +.macro karatsuba1 X Y + X .req \X + Y .req \Y + ext v25.16b, X.16b, X.16b, #8 + ext v26.16b, Y.16b, Y.16b, #8 + eor v25.16b, v25.16b, X.16b + eor v26.16b, v26.16b, Y.16b + pmull2 v28.1q, X.2d, Y.2d + pmull v29.1q, X.1d, Y.1d + pmull v27.1q, v25.1d, v26.1d + eor HI.16b, HI.16b, v28.16b + eor LO.16b, LO.16b, v29.16b + eor MI.16b, MI.16b, v27.16b + .unreq X + .unreq Y +.endm + +/* + * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into + * them. + */ +.macro karatsuba1_store X Y + X .req \X + Y .req \Y + ext v25.16b, X.16b, X.16b, #8 + ext v26.16b, Y.16b, Y.16b, #8 + eor v25.16b, v25.16b, X.16b + eor v26.16b, v26.16b, Y.16b + pmull2 HI.1q, X.2d, Y.2d + pmull LO.1q, X.1d, Y.1d + pmull MI.1q, v25.1d, v26.1d + .unreq X + .unreq Y +.endm + +/* + * Computes the 256-bit polynomial represented by LO, HI, MI. Stores + * the result in PL, PH. + * [PH : PL] = + * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0] + */ +.macro karatsuba2 + // v4 = [HI_1 + MI_1 : HI_0 + MI_0] + eor v4.16b, HI.16b, MI.16b + // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0] + eor v4.16b, v4.16b, LO.16b + // v5 = [HI_0 : LO_1] + ext v5.16b, LO.16b, HI.16b, #8 + // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0] + eor v4.16b, v4.16b, v5.16b + // HI = [HI_0 : HI_1] + ext HI.16b, HI.16b, HI.16b, #8 + // LO = [LO_0 : LO_1] + ext LO.16b, LO.16b, LO.16b, #8 + // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1] + ext PH.16b, v4.16b, HI.16b, #8 + // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0] + ext PL.16b, LO.16b, v4.16b, #8 +.endm + +/* + * Computes the 128-bit reduction of PH : PL. Stores the result in dest. + * + * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = + * x^128 + x^127 + x^126 + x^121 + 1. + * + * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the + * product of two 128-bit polynomials in Montgomery form. We need to reduce it + * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor + * of x^128, this product has two extra factors of x^128. To get it back into + * Montgomery form, we need to remove one of these factors by dividing by x^128. + * + * To accomplish both of these goals, we add multiples of g(x) that cancel out + * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low + * bits are zero, the polynomial division by x^128 can be done by right + * shifting. + * + * Since the only nonzero term in the low 64 bits of g(x) is the constant term, + * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can + * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + + * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to + * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T + * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. + * + * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits + * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 + * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * + * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : + * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). + * + * So our final computation is: + * T = T_1 : T_0 = g*(x) * P_0 + * V = V_1 : V_0 = g*(x) * (P_1 + T_0) + * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 + * + * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 + * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : + * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. + */ +.macro montgomery_reduction dest + DEST .req \dest + // TMP_V = T_1 : T_0 = P_0 * g*(x) + pmull TMP_V.1q, PL.1d, GSTAR.1d + // TMP_V = T_0 : T_1 + ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 + // TMP_V = P_1 + T_0 : P_0 + T_1 + eor TMP_V.16b, PL.16b, TMP_V.16b + // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 + eor PH.16b, PH.16b, TMP_V.16b + // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x) + pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d + eor DEST.16b, PH.16b, TMP_V.16b + .unreq DEST +.endm + +/* + * Compute Polyval on 8 blocks. + * + * If reduce is set, also computes the montgomery reduction of the + * previous full_stride call and XORs with the first message block. + * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. + * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. + * + * Sets PL, PH. + */ +.macro full_stride reduce + eor LO.16b, LO.16b, LO.16b + eor MI.16b, MI.16b, MI.16b + eor HI.16b, HI.16b, HI.16b + + ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 + ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64 + + karatsuba1 M7 KEY1 + .if \reduce + pmull TMP_V.1q, PL.1d, GSTAR.1d + .endif + + karatsuba1 M6 KEY2 + .if \reduce + ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8 + .endif + + karatsuba1 M5 KEY3 + .if \reduce + eor TMP_V.16b, PL.16b, TMP_V.16b + .endif + + karatsuba1 M4 KEY4 + .if \reduce + eor PH.16b, PH.16b, TMP_V.16b + .endif + + karatsuba1 M3 KEY5 + .if \reduce + pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d + .endif + + karatsuba1 M2 KEY6 + .if \reduce + eor SUM.16b, PH.16b, TMP_V.16b + .endif + + karatsuba1 M1 KEY7 + eor M0.16b, M0.16b, SUM.16b + + karatsuba1 M0 KEY8 + karatsuba2 +.endm + +/* + * Handle any extra blocks after full_stride loop. + */ +.macro partial_stride + add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4) + sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4 + ld1 {KEY1.16b}, [KEY_POWERS], #16 + + ld1 {TMP_V.16b}, [MSG], #16 + eor SUM.16b, SUM.16b, TMP_V.16b + karatsuba1_store KEY1 SUM + sub BLOCKS_LEFT, BLOCKS_LEFT, #1 + + tst BLOCKS_LEFT, #4 + beq .Lpartial4BlocksDone + ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64 + ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 + karatsuba1 M0 KEY8 + karatsuba1 M1 KEY7 + karatsuba1 M2 KEY6 + karatsuba1 M3 KEY5 +.Lpartial4BlocksDone: + tst BLOCKS_LEFT, #2 + beq .Lpartial2BlocksDone + ld1 {M0.16b, M1.16b}, [MSG], #32 + ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32 + karatsuba1 M0 KEY8 + karatsuba1 M1 KEY7 +.Lpartial2BlocksDone: + tst BLOCKS_LEFT, #1 + beq .LpartialDone + ld1 {M0.16b}, [MSG], #16 + ld1 {KEY8.16b}, [KEY_POWERS], #16 + karatsuba1 M0 KEY8 +.LpartialDone: + karatsuba2 + montgomery_reduction SUM +.endm + +/* + * Perform montgomery multiplication in GF(2^128) and store result in op1. + * + * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 + * If op1, op2 are in montgomery form, this computes the montgomery + * form of op1*op2. + * + * void pmull_polyval_mul(u8 *op1, const u8 *op2); + */ +SYM_FUNC_START(pmull_polyval_mul) + adr TMP, .Lgstar + ld1 {GSTAR.2d}, [TMP] + ld1 {v0.16b}, [x0] + ld1 {v1.16b}, [x1] + karatsuba1_store v0 v1 + karatsuba2 + montgomery_reduction SUM + st1 {SUM.16b}, [x0] + ret +SYM_FUNC_END(pmull_polyval_mul) + +/* + * Perform polynomial evaluation as specified by POLYVAL. This computes: + * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} + * where n=nblocks, h is the hash key, and m_i are the message blocks. + * + * x0 - pointer to precomputed key powers h^8 ... h^1 + * x1 - pointer to message blocks + * x2 - number of blocks to hash + * x3 - pointer to accumulator + * + * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, + * size_t nblocks, u8 *accumulator); + */ +SYM_FUNC_START(pmull_polyval_update) + adr TMP, .Lgstar + mov KEY_START, KEY_POWERS + ld1 {GSTAR.2d}, [TMP] + ld1 {SUM.16b}, [ACCUMULATOR] + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + blt .LstrideLoopExit + ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64 + ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64 + full_stride 0 + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + blt .LstrideLoopExitReduce +.LstrideLoop: + full_stride 1 + subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + bge .LstrideLoop +.LstrideLoopExitReduce: + montgomery_reduction SUM +.LstrideLoopExit: + adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS + beq .LskipPartial + partial_stride +.LskipPartial: + st1 {SUM.16b}, [ACCUMULATOR] + ret +SYM_FUNC_END(pmull_polyval_update) diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c new file mode 100644 index 000000000..0a3b5718d --- /dev/null +++ b/arch/arm64/crypto/polyval-ce-glue.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Glue code for POLYVAL using ARMv8 Crypto Extensions + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Copyright 2021 Google LLC + */ + +/* + * Glue code based on ghash-clmulni-intel_glue.c. + * + * This implementation of POLYVAL uses montgomery multiplication accelerated by + * ARMv8 Crypto Extensions instructions to implement the finite field operations. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/polyval.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> + +#define NUM_KEY_POWERS 8 + +struct polyval_tfm_ctx { + /* + * These powers must be in the order h^8, ..., h^1. + */ + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; +}; + +struct polyval_desc_ctx { + u8 buffer[POLYVAL_BLOCK_SIZE]; + u32 bytes; +}; + +asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator); +asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); + +static void internal_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + pmull_polyval_update(keys, in, nblocks, accumulator); + kernel_neon_end(); + } else { + polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, + nblocks, accumulator); + } +} + +static void internal_polyval_mul(u8 *op1, const u8 *op2) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + pmull_polyval_mul(op1, op2); + kernel_neon_end(); + } else { + polyval_mul_non4k(op1, op2); + } +} + +static int polyval_arm64_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); + int i; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); + + for (i = NUM_KEY_POWERS-2; i >= 0; i--) { + memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); + internal_polyval_mul(tctx->key_powers[i], + tctx->key_powers[i+1]); + } + + return 0; +} + +static int polyval_arm64_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_arm64_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + u8 *pos; + unsigned int nblocks; + unsigned int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + /* allow rescheduling every 4K bytes */ + nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + internal_polyval_update(tctx, src, nblocks, dctx->buffer); + srclen -= nblocks * POLYVAL_BLOCK_SIZE; + src += nblocks * POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer; + while (srclen--) + *pos++ ^= *src++; + } + + return 0; +} + +static int polyval_arm64_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + + if (dctx->bytes) { + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_arm64_init, + .update = polyval_arm64_update, + .final = polyval_arm64_final, + .setkey = polyval_arm64_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-ce", + .cra_priority = 200, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_module = THIS_MODULE, + }, +}; + +static int __init polyval_ce_mod_init(void) +{ + return crypto_register_shash(&polyval_alg); +} + +static void __exit polyval_ce_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +module_cpu_feature_match(PMULL, polyval_ce_mod_init) +module_exit(polyval_ce_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-ce"); diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S new file mode 100644 index 000000000..889ca0f89 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + k0 .req v0 + k1 .req v1 + k2 .req v2 + k3 .req v3 + + t0 .req v4 + t1 .req v5 + + dga .req q6 + dgav .req v6 + dgb .req s7 + dgbv .req v7 + + dg0q .req q12 + dg0s .req s12 + dg0v .req v12 + dg1s .req s13 + dg1v .req v13 + dg2s .req s14 + + .macro add_only, op, ev, rc, s0, dg1 + .ifc \ev, ev + add t1.4s, v\s0\().4s, \rc\().4s + sha1h dg2s, dg0s + .ifnb \dg1 + sha1\op dg0q, \dg1, t0.4s + .else + sha1\op dg0q, dg1s, t0.4s + .endif + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha1h dg1s, dg0s + sha1\op dg0q, dg2s, t1.4s + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1 v\s0\().4s, v\s3\().4s + .endm + + .macro loadrc, k, val, tmp + movz \tmp, :abs_g0_nc:\val + movk \tmp, :abs_g1:\val + dup \k, \tmp + .endm + + /* + * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + * int blocks) + */ +SYM_FUNC_START(sha1_ce_transform) + /* load round constants */ + loadrc k0.4s, 0x5a827999, w6 + loadrc k1.4s, 0x6ed9eba1, w6 + loadrc k2.4s, 0x8f1bbcdc, w6 + loadrc k3.4s, 0xca62c1d6, w6 + + /* load state */ + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] + + /* load sha1_ce_state::finalize */ + ldr_l w4, sha1_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] + + /* load input */ +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub w2, w2, #1 + +CPU_LE( rev32 v8.16b, v8.16b ) +CPU_LE( rev32 v9.16b, v9.16b ) +CPU_LE( rev32 v10.16b, v10.16b ) +CPU_LE( rev32 v11.16b, v11.16b ) + +1: add t0.4s, v8.4s, k0.4s + mov dg0v.16b, dgav.16b + + add_update c, ev, k0, 8, 9, 10, 11, dgb + add_update c, od, k0, 9, 10, 11, 8 + add_update c, ev, k0, 10, 11, 8, 9 + add_update c, od, k0, 11, 8, 9, 10 + add_update c, ev, k1, 8, 9, 10, 11 + + add_update p, od, k1, 9, 10, 11, 8 + add_update p, ev, k1, 10, 11, 8, 9 + add_update p, od, k1, 11, 8, 9, 10 + add_update p, ev, k1, 8, 9, 10, 11 + add_update p, od, k2, 9, 10, 11, 8 + + add_update m, ev, k2, 10, 11, 8, 9 + add_update m, od, k2, 11, 8, 9, 10 + add_update m, ev, k2, 8, 9, 10, 11 + add_update m, od, k2, 9, 10, 11, 8 + add_update m, ev, k3, 10, 11, 8, 9 + + add_update p, od, k3, 11, 8, 9, 10 + add_only p, ev, k3, 9 + add_only p, od, k3, 10 + add_only p, ev, k3, 11 + add_only p, od + + /* update state */ + add dgbv.2s, dgbv.2s, dg1v.2s + add dgav.4s, dgav.4s, dg0v.4s + + cbz w2, 2f + cond_yield 3f, x5, x6 + b 0b + + /* + * Final block: add padding and total bit count. + * Skip if the input size was not a round multiple of the block size, + * the padding is handled by the C code in that case. + */ +2: cbz x4, 3f + ldr_l w4, sha1_ce_offsetof_count, x4 + ldr x4, [x0, x4] + movi v9.2d, #0 + mov x8, #0x80000000 + movi v10.2d, #0 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) + fmov d8, x8 + mov x4, #0 + mov v11.d[0], xzr + mov v11.d[1], x7 + b 1b + + /* store new state */ +3: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov w0, w2 + ret +SYM_FUNC_END(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c new file mode 100644 index 000000000..71fa4f112 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/sha1.h> +#include <crypto/sha1_base.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha1"); + +struct sha1_ce_state { + struct sha1_state sst; + u32 finalize; +}; + +extern const u32 sha1_ce_offsetof_count; +extern const u32 sha1_ce_offsetof_finalize; + +asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + int blocks); + +static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src, + int blocks) +{ + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA1_BLOCK_SIZE; + blocks = rem; + } +} + +const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); +const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize); + +static int sha1_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_ce_state *sctx = shash_desc_ctx(desc); + + if (!crypto_simd_usable()) + return crypto_sha1_update(desc, data, len); + + sctx->finalize = 0; + sha1_base_do_update(desc, data, len, __sha1_ce_transform); + + return 0; +} + +static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct sha1_ce_state *sctx = shash_desc_ctx(desc); + bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE) && len; + + if (!crypto_simd_usable()) + return crypto_sha1_finup(desc, data, len, out); + + /* + * Allow the asm code to perform the finalization if there is no + * partial data and the input is a round multiple of the block size. + */ + sctx->finalize = finalize; + + sha1_base_do_update(desc, data, len, __sha1_ce_transform); + if (!finalize) + sha1_base_do_finalize(desc, __sha1_ce_transform); + return sha1_base_finish(desc, out); +} + +static int sha1_ce_final(struct shash_desc *desc, u8 *out) +{ + struct sha1_ce_state *sctx = shash_desc_ctx(desc); + + if (!crypto_simd_usable()) + return crypto_sha1_finup(desc, NULL, 0, out); + + sctx->finalize = 0; + sha1_base_do_finalize(desc, __sha1_ce_transform); + return sha1_base_finish(desc, out); +} + +static int sha1_ce_export(struct shash_desc *desc, void *out) +{ + struct sha1_ce_state *sctx = shash_desc_ctx(desc); + + memcpy(out, &sctx->sst, sizeof(struct sha1_state)); + return 0; +} + +static int sha1_ce_import(struct shash_desc *desc, const void *in) +{ + struct sha1_ce_state *sctx = shash_desc_ctx(desc); + + memcpy(&sctx->sst, in, sizeof(struct sha1_state)); + sctx->finalize = 0; + return 0; +} + +static struct shash_alg alg = { + .init = sha1_base_init, + .update = sha1_ce_update, + .final = sha1_ce_final, + .finup = sha1_ce_finup, + .import = sha1_ce_import, + .export = sha1_ce_export, + .descsize = sizeof(struct sha1_ce_state), + .statesize = sizeof(struct sha1_state), + .digestsize = SHA1_DIGEST_SIZE, + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-ce", + .cra_priority = 200, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sha1_ce_mod_init(void) +{ + return crypto_register_shash(&alg); +} + +static void __exit sha1_ce_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_cpu_feature_match(SHA1, sha1_ce_mod_init); +module_exit(sha1_ce_mod_fini); diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S new file mode 100644 index 000000000..491179922 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + dga .req q20 + dgav .req v20 + dgb .req q21 + dgbv .req v21 + + t0 .req v22 + t1 .req v23 + + dg0q .req q24 + dg0v .req v24 + dg1q .req q25 + dg1v .req v25 + dg2q .req q26 + dg2v .req v26 + + .macro add_only, ev, rc, s0 + mov dg2v.16b, dg0v.16b + .ifeq \ev + add t1.4s, v\s0\().4s, \rc\().4s + sha256h dg0q, dg1q, t0.4s + sha256h2 dg1q, dg2q, t0.4s + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha256h dg0q, dg1q, t1.4s + sha256h2 dg1q, dg2q, t1.4s + .endif + .endm + + .macro add_update, ev, rc, s0, s1, s2, s3 + sha256su0 v\s0\().4s, v\s1\().4s + add_only \ev, \rc, \s1 + sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s + .endm + + /* + * The SHA-256 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha2_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, + * int blocks) + */ + .text +SYM_FUNC_START(sha2_ce_transform) + /* load round constants */ + adr_l x8, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [x8], #64 + ld1 { v4.4s- v7.4s}, [x8], #64 + ld1 { v8.4s-v11.4s}, [x8], #64 + ld1 {v12.4s-v15.4s}, [x8] + + /* load state */ + ld1 {dgav.4s, dgbv.4s}, [x0] + + /* load sha256_ce_state::finalize */ + ldr_l w4, sha256_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] + + /* load input */ +0: ld1 {v16.4s-v19.4s}, [x1], #64 + sub w2, w2, #1 + +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) + +1: add t0.4s, v16.4s, v0.4s + mov dg0v.16b, dgav.16b + mov dg1v.16b, dgbv.16b + + add_update 0, v1, 16, 17, 18, 19 + add_update 1, v2, 17, 18, 19, 16 + add_update 0, v3, 18, 19, 16, 17 + add_update 1, v4, 19, 16, 17, 18 + + add_update 0, v5, 16, 17, 18, 19 + add_update 1, v6, 17, 18, 19, 16 + add_update 0, v7, 18, 19, 16, 17 + add_update 1, v8, 19, 16, 17, 18 + + add_update 0, v9, 16, 17, 18, 19 + add_update 1, v10, 17, 18, 19, 16 + add_update 0, v11, 18, 19, 16, 17 + add_update 1, v12, 19, 16, 17, 18 + + add_only 0, v13, 17 + add_only 1, v14, 18 + add_only 0, v15, 19 + add_only 1 + + /* update state */ + add dgav.4s, dgav.4s, dg0v.4s + add dgbv.4s, dgbv.4s, dg1v.4s + + /* handled all input blocks? */ + cbz w2, 2f + cond_yield 3f, x5, x6 + b 0b + + /* + * Final block: add padding and total bit count. + * Skip if the input size was not a round multiple of the block size, + * the padding is handled by the C code in that case. + */ +2: cbz x4, 3f + ldr_l w4, sha256_ce_offsetof_count, x4 + ldr x4, [x0, x4] + movi v17.2d, #0 + mov x8, #0x80000000 + movi v18.2d, #0 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) + fmov d16, x8 + mov x4, #0 + mov v19.d[0], xzr + mov v19.d[1], x7 + b 1b + + /* store new state */ +3: st1 {dgav.4s, dgbv.4s}, [x0] + mov w0, w2 + ret +SYM_FUNC_END(sha2_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c new file mode 100644 index 000000000..c57a6119f --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/sha2.h> +#include <crypto/sha256_base.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha256"); + +struct sha256_ce_state { + struct sha256_state sst; + u32 finalize; +}; + +extern const u32 sha256_ce_offsetof_count; +extern const u32 sha256_ce_offsetof_finalize; + +asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, + int blocks); + +static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src, + int blocks) +{ + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA256_BLOCK_SIZE; + blocks = rem; + } +} + +const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, + sst.count); +const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, + finalize); + +asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks); + +static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src, + int blocks) +{ + sha256_block_data_order(sst->state, src, blocks); +} + +static int sha256_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_ce_state *sctx = shash_desc_ctx(desc); + + if (!crypto_simd_usable()) + return sha256_base_do_update(desc, data, len, + __sha256_block_data_order); + + sctx->finalize = 0; + sha256_base_do_update(desc, data, len, __sha2_ce_transform); + + return 0; +} + +static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct sha256_ce_state *sctx = shash_desc_ctx(desc); + bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE) && len; + + if (!crypto_simd_usable()) { + if (len) + sha256_base_do_update(desc, data, len, + __sha256_block_data_order); + sha256_base_do_finalize(desc, __sha256_block_data_order); + return sha256_base_finish(desc, out); + } + + /* + * Allow the asm code to perform the finalization if there is no + * partial data and the input is a round multiple of the block size. + */ + sctx->finalize = finalize; + + sha256_base_do_update(desc, data, len, __sha2_ce_transform); + if (!finalize) + sha256_base_do_finalize(desc, __sha2_ce_transform); + return sha256_base_finish(desc, out); +} + +static int sha256_ce_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_ce_state *sctx = shash_desc_ctx(desc); + + if (!crypto_simd_usable()) { + sha256_base_do_finalize(desc, __sha256_block_data_order); + return sha256_base_finish(desc, out); + } + + sctx->finalize = 0; + sha256_base_do_finalize(desc, __sha2_ce_transform); + return sha256_base_finish(desc, out); +} + +static int sha256_ce_export(struct shash_desc *desc, void *out) +{ + struct sha256_ce_state *sctx = shash_desc_ctx(desc); + + memcpy(out, &sctx->sst, sizeof(struct sha256_state)); + return 0; +} + +static int sha256_ce_import(struct shash_desc *desc, const void *in) +{ + struct sha256_ce_state *sctx = shash_desc_ctx(desc); + + memcpy(&sctx->sst, in, sizeof(struct sha256_state)); + sctx->finalize = 0; + return 0; +} + +static struct shash_alg algs[] = { { + .init = sha224_base_init, + .update = sha256_ce_update, + .final = sha256_ce_final, + .finup = sha256_ce_finup, + .export = sha256_ce_export, + .import = sha256_ce_import, + .descsize = sizeof(struct sha256_ce_state), + .statesize = sizeof(struct sha256_state), + .digestsize = SHA224_DIGEST_SIZE, + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ce", + .cra_priority = 200, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .init = sha256_base_init, + .update = sha256_ce_update, + .final = sha256_ce_final, + .finup = sha256_ce_finup, + .export = sha256_ce_export, + .import = sha256_ce_import, + .descsize = sizeof(struct sha256_ce_state), + .statesize = sizeof(struct sha256_state), + .digestsize = SHA256_DIGEST_SIZE, + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ce", + .cra_priority = 200, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init sha2_ce_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha2_ce_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA2, sha2_ce_mod_init); +module_exit(sha2_ce_mod_fini); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c new file mode 100644 index 000000000..9462f6088 --- /dev/null +++ b/arch/arm64/crypto/sha256-glue.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Linux/arm64 port of the OpenSSL SHA256 implementation for AArch64 + * + * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/sha2.h> +#include <crypto/sha256_base.h> +#include <linux/types.h> +#include <linux/string.h> + +MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash for arm64"); +MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha256"); + +asmlinkage void sha256_block_data_order(u32 *digest, const void *data, + unsigned int num_blks); +EXPORT_SYMBOL(sha256_block_data_order); + +static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src, + int blocks) +{ + sha256_block_data_order(sst->state, src, blocks); +} + +asmlinkage void sha256_block_neon(u32 *digest, const void *data, + unsigned int num_blks); + +static void __sha256_block_neon(struct sha256_state *sst, u8 const *src, + int blocks) +{ + sha256_block_neon(sst->state, src, blocks); +} + +static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_base_do_update(desc, data, len, + __sha256_block_data_order); +} + +static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (len) + sha256_base_do_update(desc, data, len, + __sha256_block_data_order); + sha256_base_do_finalize(desc, __sha256_block_data_order); + + return sha256_base_finish(desc, out); +} + +static int crypto_sha256_arm64_final(struct shash_desc *desc, u8 *out) +{ + return crypto_sha256_arm64_finup(desc, NULL, 0, out); +} + +static struct shash_alg algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = crypto_sha256_arm64_update, + .final = crypto_sha256_arm64_final, + .finup = crypto_sha256_arm64_finup, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha256", + .base.cra_driver_name = "sha256-arm64", + .base.cra_priority = 125, + .base.cra_blocksize = SHA256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = crypto_sha256_arm64_update, + .final = crypto_sha256_arm64_final, + .finup = crypto_sha256_arm64_finup, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha224", + .base.cra_driver_name = "sha224-arm64", + .base.cra_priority = 125, + .base.cra_blocksize = SHA224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int sha256_update_neon(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + if (!crypto_simd_usable()) + return sha256_base_do_update(desc, data, len, + __sha256_block_data_order); + + while (len > 0) { + unsigned int chunk = len; + + /* + * Don't hog the CPU for the entire time it takes to process all + * input when running on a preemptible kernel, but process the + * data block by block instead. + */ + if (IS_ENABLED(CONFIG_PREEMPTION) && + chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE) + chunk = SHA256_BLOCK_SIZE - + sctx->count % SHA256_BLOCK_SIZE; + + kernel_neon_begin(); + sha256_base_do_update(desc, data, chunk, __sha256_block_neon); + kernel_neon_end(); + data += chunk; + len -= chunk; + } + return 0; +} + +static int sha256_finup_neon(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!crypto_simd_usable()) { + if (len) + sha256_base_do_update(desc, data, len, + __sha256_block_data_order); + sha256_base_do_finalize(desc, __sha256_block_data_order); + } else { + if (len) + sha256_update_neon(desc, data, len); + kernel_neon_begin(); + sha256_base_do_finalize(desc, __sha256_block_neon); + kernel_neon_end(); + } + return sha256_base_finish(desc, out); +} + +static int sha256_final_neon(struct shash_desc *desc, u8 *out) +{ + return sha256_finup_neon(desc, NULL, 0, out); +} + +static struct shash_alg neon_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_update_neon, + .final = sha256_final_neon, + .finup = sha256_finup_neon, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha256", + .base.cra_driver_name = "sha256-arm64-neon", + .base.cra_priority = 150, + .base.cra_blocksize = SHA256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_update_neon, + .final = sha256_final_neon, + .finup = sha256_finup_neon, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha224", + .base.cra_driver_name = "sha224-arm64-neon", + .base.cra_priority = 150, + .base.cra_blocksize = SHA224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init sha256_mod_init(void) +{ + int ret = crypto_register_shashes(algs, ARRAY_SIZE(algs)); + if (ret) + return ret; + + if (cpu_have_named_feature(ASIMD)) { + ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs)); + if (ret) + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + } + return ret; +} + +static void __exit sha256_mod_fini(void) +{ + if (cpu_have_named_feature(ASIMD)) + crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs)); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(sha256_mod_init); +module_exit(sha256_mod_fini); diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S new file mode 100644 index 000000000..9c77313f5 --- /dev/null +++ b/arch/arm64/crypto/sha3-ce-core.S @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + .set .Lv\b\().2d, \b + .set .Lv\b\().16b, \b + .endr + + /* + * ARMv8.2 Crypto Extensions instructions + */ + .macro eor3, rd, rn, rm, ra + .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro rax1, rd, rn, rm + .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro bcax, rd, rn, rm, ra + .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro xar, rd, rn, rm, imm6 + .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16) + .endm + + /* + * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) + */ + .text +SYM_FUNC_START(sha3_ce_transform) + /* load state */ + add x8, x0, #32 + ld1 { v0.1d- v3.1d}, [x0] + ld1 { v4.1d- v7.1d}, [x8], #32 + ld1 { v8.1d-v11.1d}, [x8], #32 + ld1 {v12.1d-v15.1d}, [x8], #32 + ld1 {v16.1d-v19.1d}, [x8], #32 + ld1 {v20.1d-v23.1d}, [x8], #32 + ld1 {v24.1d}, [x8] + +0: sub w2, w2, #1 + mov w8, #24 + adr_l x9, .Lsha3_rcon + + /* load input */ + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v31.8b}, [x1], #24 + eor v0.8b, v0.8b, v25.8b + eor v1.8b, v1.8b, v26.8b + eor v2.8b, v2.8b, v27.8b + eor v3.8b, v3.8b, v28.8b + eor v4.8b, v4.8b, v29.8b + eor v5.8b, v5.8b, v30.8b + eor v6.8b, v6.8b, v31.8b + + tbnz x3, #6, 2f // SHA3-512 + + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v30.8b}, [x1], #16 + eor v7.8b, v7.8b, v25.8b + eor v8.8b, v8.8b, v26.8b + eor v9.8b, v9.8b, v27.8b + eor v10.8b, v10.8b, v28.8b + eor v11.8b, v11.8b, v29.8b + eor v12.8b, v12.8b, v30.8b + + tbnz x3, #4, 1f // SHA3-384 or SHA3-224 + + // SHA3-256 + ld1 {v25.8b-v28.8b}, [x1], #32 + eor v13.8b, v13.8b, v25.8b + eor v14.8b, v14.8b, v26.8b + eor v15.8b, v15.8b, v27.8b + eor v16.8b, v16.8b, v28.8b + b 3f + +1: tbz x3, #2, 3f // bit 2 cleared? SHA-384 + + // SHA3-224 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b}, [x1], #8 + eor v13.8b, v13.8b, v25.8b + eor v14.8b, v14.8b, v26.8b + eor v15.8b, v15.8b, v27.8b + eor v16.8b, v16.8b, v28.8b + eor v17.8b, v17.8b, v29.8b + b 3f + + // SHA3-512 +2: ld1 {v25.8b-v26.8b}, [x1], #16 + eor v7.8b, v7.8b, v25.8b + eor v8.8b, v8.8b, v26.8b + +3: sub w8, w8, #1 + + eor3 v29.16b, v4.16b, v9.16b, v14.16b + eor3 v26.16b, v1.16b, v6.16b, v11.16b + eor3 v28.16b, v3.16b, v8.16b, v13.16b + eor3 v25.16b, v0.16b, v5.16b, v10.16b + eor3 v27.16b, v2.16b, v7.16b, v12.16b + eor3 v29.16b, v29.16b, v19.16b, v24.16b + eor3 v26.16b, v26.16b, v16.16b, v21.16b + eor3 v28.16b, v28.16b, v18.16b, v23.16b + eor3 v25.16b, v25.16b, v15.16b, v20.16b + eor3 v27.16b, v27.16b, v17.16b, v22.16b + + rax1 v30.2d, v29.2d, v26.2d // bc[0] + rax1 v26.2d, v26.2d, v28.2d // bc[2] + rax1 v28.2d, v28.2d, v25.2d // bc[4] + rax1 v25.2d, v25.2d, v27.2d // bc[1] + rax1 v27.2d, v27.2d, v29.2d // bc[3] + + eor v0.16b, v0.16b, v30.16b + xar v29.2d, v1.2d, v25.2d, (64 - 1) + xar v1.2d, v6.2d, v25.2d, (64 - 44) + xar v6.2d, v9.2d, v28.2d, (64 - 20) + xar v9.2d, v22.2d, v26.2d, (64 - 61) + xar v22.2d, v14.2d, v28.2d, (64 - 39) + xar v14.2d, v20.2d, v30.2d, (64 - 18) + xar v31.2d, v2.2d, v26.2d, (64 - 62) + xar v2.2d, v12.2d, v26.2d, (64 - 43) + xar v12.2d, v13.2d, v27.2d, (64 - 25) + xar v13.2d, v19.2d, v28.2d, (64 - 8) + xar v19.2d, v23.2d, v27.2d, (64 - 56) + xar v23.2d, v15.2d, v30.2d, (64 - 41) + xar v15.2d, v4.2d, v28.2d, (64 - 27) + xar v28.2d, v24.2d, v28.2d, (64 - 14) + xar v24.2d, v21.2d, v25.2d, (64 - 2) + xar v8.2d, v8.2d, v27.2d, (64 - 55) + xar v4.2d, v16.2d, v25.2d, (64 - 45) + xar v16.2d, v5.2d, v30.2d, (64 - 36) + xar v5.2d, v3.2d, v27.2d, (64 - 28) + xar v27.2d, v18.2d, v27.2d, (64 - 21) + xar v3.2d, v17.2d, v26.2d, (64 - 15) + xar v25.2d, v11.2d, v25.2d, (64 - 10) + xar v26.2d, v7.2d, v26.2d, (64 - 6) + xar v30.2d, v10.2d, v30.2d, (64 - 3) + + bcax v20.16b, v31.16b, v22.16b, v8.16b + bcax v21.16b, v8.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v31.16b, v24.16b + bcax v24.16b, v24.16b, v8.16b, v31.16b + + ld1r {v31.2d}, [x9], #8 + + bcax v17.16b, v25.16b, v19.16b, v3.16b + bcax v18.16b, v3.16b, v15.16b, v19.16b + bcax v19.16b, v19.16b, v16.16b, v15.16b + bcax v15.16b, v15.16b, v25.16b, v16.16b + bcax v16.16b, v16.16b, v3.16b, v25.16b + + bcax v10.16b, v29.16b, v12.16b, v26.16b + bcax v11.16b, v26.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v29.16b, v14.16b + bcax v14.16b, v14.16b, v26.16b, v29.16b + + bcax v7.16b, v30.16b, v9.16b, v4.16b + bcax v8.16b, v4.16b, v5.16b, v9.16b + bcax v9.16b, v9.16b, v6.16b, v5.16b + bcax v5.16b, v5.16b, v30.16b, v6.16b + bcax v6.16b, v6.16b, v4.16b, v30.16b + + bcax v3.16b, v27.16b, v0.16b, v28.16b + bcax v4.16b, v28.16b, v1.16b, v0.16b + bcax v0.16b, v0.16b, v2.16b, v1.16b + bcax v1.16b, v1.16b, v27.16b, v2.16b + bcax v2.16b, v2.16b, v28.16b, v27.16b + + eor v0.16b, v0.16b, v31.16b + + cbnz w8, 3b + cond_yield 4f, x8, x9 + cbnz w2, 0b + + /* save state */ +4: st1 { v0.1d- v3.1d}, [x0], #32 + st1 { v4.1d- v7.1d}, [x0], #32 + st1 { v8.1d-v11.1d}, [x0], #32 + st1 {v12.1d-v15.1d}, [x0], #32 + st1 {v16.1d-v19.1d}, [x0], #32 + st1 {v20.1d-v23.1d}, [x0], #32 + st1 {v24.1d}, [x0] + mov w0, w2 + ret +SYM_FUNC_END(sha3_ce_transform) + + .section ".rodata", "a" + .align 8 +.Lsha3_rcon: + .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a + .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a + .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a + .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089 + .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080 + .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081 + .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c new file mode 100644 index 000000000..250e1377c --- /dev/null +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/sha3.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha3-224"); +MODULE_ALIAS_CRYPTO("sha3-256"); +MODULE_ALIAS_CRYPTO("sha3-384"); +MODULE_ALIAS_CRYPTO("sha3-512"); + +asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, + int md_len); + +static int sha3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha3_state *sctx = shash_desc_ctx(desc); + unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + + if (!crypto_simd_usable()) + return crypto_sha3_update(desc, data, len); + + if ((sctx->partial + len) >= sctx->rsiz) { + int blocks; + + if (sctx->partial) { + int p = sctx->rsiz - sctx->partial; + + memcpy(sctx->buf + sctx->partial, data, p); + kernel_neon_begin(); + sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + kernel_neon_end(); + + data += p; + len -= p; + sctx->partial = 0; + } + + blocks = len / sctx->rsiz; + len %= sctx->rsiz; + + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha3_ce_transform(sctx->st, data, blocks, + digest_size); + kernel_neon_end(); + data += (blocks - rem) * sctx->rsiz; + blocks = rem; + } + } + + if (len) { + memcpy(sctx->buf + sctx->partial, data, len); + sctx->partial += len; + } + return 0; +} + +static int sha3_final(struct shash_desc *desc, u8 *out) +{ + struct sha3_state *sctx = shash_desc_ctx(desc); + unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + __le64 *digest = (__le64 *)out; + int i; + + if (!crypto_simd_usable()) + return crypto_sha3_final(desc, out); + + sctx->buf[sctx->partial++] = 0x06; + memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial); + sctx->buf[sctx->rsiz - 1] |= 0x80; + + kernel_neon_begin(); + sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + kernel_neon_end(); + + for (i = 0; i < digest_size / 8; i++) + put_unaligned_le64(sctx->st[i], digest++); + + if (digest_size & 4) + put_unaligned_le32(sctx->st[i], (__le32 *)digest); + + memzero_explicit(sctx, sizeof(*sctx)); + return 0; +} + +static struct shash_alg algs[] = { { + .digestsize = SHA3_224_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-224", + .base.cra_driver_name = "sha3-224-ce", + .base.cra_blocksize = SHA3_224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}, { + .digestsize = SHA3_256_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-256", + .base.cra_driver_name = "sha3-256-ce", + .base.cra_blocksize = SHA3_256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}, { + .digestsize = SHA3_384_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-384", + .base.cra_driver_name = "sha3-384-ce", + .base.cra_blocksize = SHA3_384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}, { + .digestsize = SHA3_512_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-512", + .base.cra_driver_name = "sha3-512-ce", + .base.cra_blocksize = SHA3_512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +} }; + +static int __init sha3_neon_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha3_neon_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA3, sha3_neon_mod_init); +module_exit(sha3_neon_mod_fini); diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl new file mode 100644 index 000000000..35ec9ae99 --- /dev/null +++ b/arch/arm64/crypto/sha512-armv8.pl @@ -0,0 +1,786 @@ +#! /usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significantly faster +# and the gap is only 40-90%. +# +# October 2016. +# +# Originally it was reckoned that it makes no sense to implement NEON +# version of SHA256 for 64-bit processors. This is because performance +# improvement on most wide-spread Cortex-A5x processors was observed +# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +# observed that 32-bit NEON SHA256 performs significantly better than +# 64-bit scalar version on *some* of the more recent processors. As +# result 64-bit NEON version of SHA256 was added to provide best +# all-round performance. For example it executes ~30% faster on X-Gene +# and Mongoose. [For reference, NEON version of SHA512 is bound to +# deliver much less improvement, likely *negative* on Cortex-A5x. +# Which is why NEON support is limited to SHA256.] + +$output=pop; +$flavour=pop; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" $xlate $flavour $output"; + *STDOUT=*OUT; +} else { + open STDOUT,">$output"; +} + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __AARCH64EB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. +$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else + ldr x16,.LOPENSSL_armcap_P +# endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +#endif +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +#ifndef __KERNEL__ +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +#endif +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. + +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) { eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +___ +} + +$code.=<<___; +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S new file mode 100644 index 000000000..b6a3a36e1 --- /dev/null +++ b/arch/arm64/crypto/sha512-ce-core.S @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + .set .Lq\b, \b + .set .Lv\b\().2d, \b + .endr + + .macro sha512h, rd, rn, rm + .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sha512h2, rd, rn, rm + .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sha512su0, rd, rn + .inst 0xcec08000 | .L\rd | (.L\rn << 5) + .endm + + .macro sha512su1, rd, rn, rm + .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + /* + * The SHA-512 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha512_rcon: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 + + .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 + .ifnb \rc1 + ld1 {v\rc1\().2d}, [x4], #16 + .endif + add v5.2d, v\rc0\().2d, v\in0\().2d + ext v6.16b, v\i2\().16b, v\i3\().16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + ext v7.16b, v\i1\().16b, v\i2\().16b, #8 + add v\i3\().2d, v\i3\().2d, v5.2d + .ifnb \in1 + ext v5.16b, v\in3\().16b, v\in4\().16b, #8 + sha512su0 v\in0\().2d, v\in1\().2d + .endif + sha512h q\i3, q6, v7.2d + .ifnb \in1 + sha512su1 v\in0\().2d, v\in2\().2d, v5.2d + .endif + add v\i4\().2d, v\i1\().2d, v\i3\().2d + sha512h2 q\i3, q\i1, v\i0\().2d + .endm + + /* + * void sha512_ce_transform(struct sha512_state *sst, u8 const *src, + * int blocks) + */ + .text +SYM_FUNC_START(sha512_ce_transform) + /* load state */ + ld1 {v8.2d-v11.2d}, [x0] + + /* load first 4 round constants */ + adr_l x3, .Lsha512_rcon + ld1 {v20.2d-v23.2d}, [x3], #64 + + /* load input */ +0: ld1 {v12.2d-v15.2d}, [x1], #64 + ld1 {v16.2d-v19.2d}, [x1], #64 + sub w2, w2, #1 + +CPU_LE( rev64 v12.16b, v12.16b ) +CPU_LE( rev64 v13.16b, v13.16b ) +CPU_LE( rev64 v14.16b, v14.16b ) +CPU_LE( rev64 v15.16b, v15.16b ) +CPU_LE( rev64 v16.16b, v16.16b ) +CPU_LE( rev64 v17.16b, v17.16b ) +CPU_LE( rev64 v18.16b, v18.16b ) +CPU_LE( rev64 v19.16b, v19.16b ) + + mov x4, x3 // rc pointer + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + // v0 ab cd -- ef gh ab + // v1 cd -- ef gh ab cd + // v2 ef gh ab cd -- ef + // v3 gh ab cd -- ef gh + // v4 -- ef gh ab cd -- + + dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17 + dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 + dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 + dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 + dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 + + dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 + dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 + dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 + dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 + dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 + + dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 + dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 + dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 + dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 + dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 + + dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 + dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 + dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 + dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 + dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 + + dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 + dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 + dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 + dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 + dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 + + dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 + dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 + dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 + dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 + dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 + + dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 + dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 + dround 2, 3, 1, 4, 0, 28, 24, 12 + dround 4, 2, 0, 1, 3, 29, 25, 13 + dround 1, 4, 3, 0, 2, 30, 26, 14 + + dround 0, 1, 2, 3, 4, 31, 27, 15 + dround 3, 0, 4, 2, 1, 24, , 16 + dround 2, 3, 1, 4, 0, 25, , 17 + dround 4, 2, 0, 1, 3, 26, , 18 + dround 1, 4, 3, 0, 2, 27, , 19 + + /* update state */ + add v8.2d, v8.2d, v0.2d + add v9.2d, v9.2d, v1.2d + add v10.2d, v10.2d, v2.2d + add v11.2d, v11.2d, v3.2d + + cond_yield 3f, x4, x5 + /* handled all input blocks? */ + cbnz w2, 0b + + /* store new state */ +3: st1 {v8.2d-v11.2d}, [x0] + mov w0, w2 + ret +SYM_FUNC_END(sha512_ce_transform) diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c new file mode 100644 index 000000000..94cb7580d --- /dev/null +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/sha2.h> +#include <crypto/sha512_base.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); + +asmlinkage int sha512_ce_transform(struct sha512_state *sst, u8 const *src, + int blocks); + +asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); + +static void __sha512_ce_transform(struct sha512_state *sst, u8 const *src, + int blocks) +{ + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha512_ce_transform(sst, src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA512_BLOCK_SIZE; + blocks = rem; + } +} + +static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src, + int blocks) +{ + sha512_block_data_order(sst->state, src, blocks); +} + +static int sha512_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; + + sha512_base_do_update(desc, data, len, fn); + return 0; +} + +static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; + + sha512_base_do_update(desc, data, len, fn); + sha512_base_do_finalize(desc, fn); + return sha512_base_finish(desc, out); +} + +static int sha512_ce_final(struct shash_desc *desc, u8 *out) +{ + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; + + sha512_base_do_finalize(desc, fn); + return sha512_base_finish(desc, out); +} + +static struct shash_alg algs[] = { { + .init = sha384_base_init, + .update = sha512_ce_update, + .final = sha512_ce_final, + .finup = sha512_ce_finup, + .descsize = sizeof(struct sha512_state), + .digestsize = SHA384_DIGEST_SIZE, + .base.cra_name = "sha384", + .base.cra_driver_name = "sha384-ce", + .base.cra_priority = 200, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .init = sha512_base_init, + .update = sha512_ce_update, + .final = sha512_ce_final, + .finup = sha512_ce_finup, + .descsize = sizeof(struct sha512_state), + .digestsize = SHA512_DIGEST_SIZE, + .base.cra_name = "sha512", + .base.cra_driver_name = "sha512-ce", + .base.cra_priority = 200, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init sha512_ce_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha512_ce_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA512, sha512_ce_mod_init); +module_exit(sha512_ce_mod_fini); diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c new file mode 100644 index 000000000..2acff1c7d --- /dev/null +++ b/arch/arm64/crypto/sha512-glue.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64 + * + * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <crypto/internal/hash.h> +#include <linux/types.h> +#include <linux/string.h> +#include <crypto/sha2.h> +#include <crypto/sha512_base.h> +#include <asm/neon.h> + +MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64"); +MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); + +asmlinkage void sha512_block_data_order(u64 *digest, const void *data, + unsigned int num_blks); +EXPORT_SYMBOL(sha512_block_data_order); + +static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src, + int blocks) +{ + sha512_block_data_order(sst->state, src, blocks); +} + +static int sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_base_do_update(desc, data, len, + __sha512_block_data_order); +} + +static int sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (len) + sha512_base_do_update(desc, data, len, + __sha512_block_data_order); + sha512_base_do_finalize(desc, __sha512_block_data_order); + + return sha512_base_finish(desc, out); +} + +static int sha512_final(struct shash_desc *desc, u8 *out) +{ + return sha512_finup(desc, NULL, 0, out); +} + +static struct shash_alg algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_update, + .final = sha512_final, + .finup = sha512_finup, + .descsize = sizeof(struct sha512_state), + .base.cra_name = "sha512", + .base.cra_driver_name = "sha512-arm64", + .base.cra_priority = 150, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_update, + .final = sha512_final, + .finup = sha512_finup, + .descsize = sizeof(struct sha512_state), + .base.cra_name = "sha384", + .base.cra_driver_name = "sha384-arm64", + .base.cra_priority = 150, + .base.cra_blocksize = SHA384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init sha512_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha512_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(sha512_mod_init); +module_exit(sha512_mod_fini); diff --git a/arch/arm64/crypto/sm3-ce-core.S b/arch/arm64/crypto/sm3-ce-core.S new file mode 100644 index 000000000..ca70cfacd --- /dev/null +++ b/arch/arm64/crypto/sm3-ce-core.S @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sm3-ce-core.S - SM3 secure hash using ARMv8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <asm/assembler.h> + + .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + .set .Lv\b\().4s, \b + .endr + + .macro sm3partw1, rd, rn, rm + .inst 0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sm3partw2, rd, rn, rm + .inst 0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sm3ss1, rd, rn, rm, ra + .inst 0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro sm3tt1a, rd, rn, rm, imm2 + .inst 0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro sm3tt1b, rd, rn, rm, imm2 + .inst 0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro sm3tt2a, rd, rn, rm, imm2 + .inst 0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro sm3tt2b, rd, rn, rm, imm2 + .inst 0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro round, ab, s0, t0, t1, i + sm3ss1 v5.4s, v8.4s, \t0\().4s, v9.4s + shl \t1\().4s, \t0\().4s, #1 + sri \t1\().4s, \t0\().4s, #31 + sm3tt1\ab v8.4s, v5.4s, v10.4s, \i + sm3tt2\ab v9.4s, v5.4s, \s0\().4s, \i + .endm + + .macro qround, ab, s0, s1, s2, s3, s4 + .ifnb \s4 + ext \s4\().16b, \s1\().16b, \s2\().16b, #12 + ext v6.16b, \s0\().16b, \s1\().16b, #12 + ext v7.16b, \s2\().16b, \s3\().16b, #8 + sm3partw1 \s4\().4s, \s0\().4s, \s3\().4s + .endif + + eor v10.16b, \s0\().16b, \s1\().16b + + round \ab, \s0, v11, v12, 0 + round \ab, \s0, v12, v11, 1 + round \ab, \s0, v11, v12, 2 + round \ab, \s0, v12, v11, 3 + + .ifnb \s4 + sm3partw2 \s4\().4s, v7.4s, v6.4s + .endif + .endm + + /* + * void sm3_ce_transform(struct sm3_state *sst, u8 const *src, + * int blocks) + */ + .text +SYM_TYPED_FUNC_START(sm3_ce_transform) + /* load state */ + ld1 {v8.4s-v9.4s}, [x0] + rev64 v8.4s, v8.4s + rev64 v9.4s, v9.4s + ext v8.16b, v8.16b, v8.16b, #8 + ext v9.16b, v9.16b, v9.16b, #8 + + adr_l x8, .Lt + ldp s13, s14, [x8] + + /* load input */ +0: ld1 {v0.16b-v3.16b}, [x1], #64 + sub w2, w2, #1 + + mov v15.16b, v8.16b + mov v16.16b, v9.16b + +CPU_LE( rev32 v0.16b, v0.16b ) +CPU_LE( rev32 v1.16b, v1.16b ) +CPU_LE( rev32 v2.16b, v2.16b ) +CPU_LE( rev32 v3.16b, v3.16b ) + + ext v11.16b, v13.16b, v13.16b, #4 + + qround a, v0, v1, v2, v3, v4 + qround a, v1, v2, v3, v4, v0 + qround a, v2, v3, v4, v0, v1 + qround a, v3, v4, v0, v1, v2 + + ext v11.16b, v14.16b, v14.16b, #4 + + qround b, v4, v0, v1, v2, v3 + qround b, v0, v1, v2, v3, v4 + qround b, v1, v2, v3, v4, v0 + qround b, v2, v3, v4, v0, v1 + qround b, v3, v4, v0, v1, v2 + qround b, v4, v0, v1, v2, v3 + qround b, v0, v1, v2, v3, v4 + qround b, v1, v2, v3, v4, v0 + qround b, v2, v3, v4, v0, v1 + qround b, v3, v4 + qround b, v4, v0 + qround b, v0, v1 + + eor v8.16b, v8.16b, v15.16b + eor v9.16b, v9.16b, v16.16b + + /* handled all input blocks? */ + cbnz w2, 0b + + /* save state */ + rev64 v8.4s, v8.4s + rev64 v9.4s, v9.4s + ext v8.16b, v8.16b, v8.16b, #8 + ext v9.16b, v9.16b, v9.16b, #8 + st1 {v8.4s-v9.4s}, [x0] + ret +SYM_FUNC_END(sm3_ce_transform) + + .section ".rodata", "a" + .align 3 +.Lt: .word 0x79cc4519, 0x9d8a7a87 diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c new file mode 100644 index 000000000..ee98954ae --- /dev/null +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sm3-ce-glue.c - SM3 secure hash using ARMv8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/sm3.h> +#include <crypto/sm3_base.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, + int blocks); + +static int sm3_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + if (!crypto_simd_usable()) { + sm3_update(shash_desc_ctx(desc), data, len); + return 0; + } + + kernel_neon_begin(); + sm3_base_do_update(desc, data, len, sm3_ce_transform); + kernel_neon_end(); + + return 0; +} + +static int sm3_ce_final(struct shash_desc *desc, u8 *out) +{ + if (!crypto_simd_usable()) { + sm3_final(shash_desc_ctx(desc), out); + return 0; + } + + kernel_neon_begin(); + sm3_base_do_finalize(desc, sm3_ce_transform); + kernel_neon_end(); + + return sm3_base_finish(desc, out); +} + +static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!crypto_simd_usable()) { + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + sm3_final(sctx, out); + return 0; + } + + kernel_neon_begin(); + if (len) + sm3_base_do_update(desc, data, len, sm3_ce_transform); + sm3_base_do_finalize(desc, sm3_ce_transform); + kernel_neon_end(); + + return sm3_base_finish(desc, out); +} + +static struct shash_alg sm3_alg = { + .digestsize = SM3_DIGEST_SIZE, + .init = sm3_base_init, + .update = sm3_ce_update, + .final = sm3_ce_final, + .finup = sm3_ce_finup, + .descsize = sizeof(struct sm3_state), + .base.cra_name = "sm3", + .base.cra_driver_name = "sm3-ce", + .base.cra_blocksize = SM3_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}; + +static int __init sm3_ce_mod_init(void) +{ + return crypto_register_shash(&sm3_alg); +} + +static void __exit sm3_ce_mod_fini(void) +{ + crypto_unregister_shash(&sm3_alg); +} + +module_cpu_feature_match(SM3, sm3_ce_mod_init); +module_exit(sm3_ce_mod_fini); diff --git a/arch/arm64/crypto/sm3-neon-core.S b/arch/arm64/crypto/sm3-neon-core.S new file mode 100644 index 000000000..4357e0e51 --- /dev/null +++ b/arch/arm64/crypto/sm3-neon-core.S @@ -0,0 +1,601 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * sm3-neon-core.S - SM3 secure hash using NEON instructions + * + * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64 + * + * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <asm/assembler.h> + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 +#define state_h5 20 +#define state_h6 24 +#define state_h7 28 + +/* Stack structure */ + +#define STACK_W_SIZE (32 * 2 * 3) + +#define STACK_W (0) +#define STACK_SIZE (STACK_W + STACK_W_SIZE) + +/* Register macros */ + +#define RSTATE x0 +#define RDATA x1 +#define RNBLKS x2 +#define RKPTR x28 +#define RFRAME x29 + +#define ra w3 +#define rb w4 +#define rc w5 +#define rd w6 +#define re w7 +#define rf w8 +#define rg w9 +#define rh w10 + +#define t0 w11 +#define t1 w12 +#define t2 w13 +#define t3 w14 +#define t4 w15 +#define t5 w16 +#define t6 w17 + +#define k_even w19 +#define k_odd w20 + +#define addr0 x21 +#define addr1 x22 + +#define s0 w23 +#define s1 w24 +#define s2 w25 +#define s3 w26 + +#define W0 v0 +#define W1 v1 +#define W2 v2 +#define W3 v3 +#define W4 v4 +#define W5 v5 + +#define XTMP0 v6 +#define XTMP1 v7 +#define XTMP2 v16 +#define XTMP3 v17 +#define XTMP4 v18 +#define XTMP5 v19 +#define XTMP6 v20 + +/* Helper macros. */ + +#define _(...) /*_*/ + +#define clear_vec(x) \ + movi x.8h, #0; + +#define rolw(o, a, n) \ + ror o, a, #(32 - n); + +/* Round function macros. */ + +#define GG1_1(x, y, z, o, t) \ + eor o, x, y; +#define GG1_2(x, y, z, o, t) \ + eor o, o, z; +#define GG1_3(x, y, z, o, t) + +#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t) +#define FF1_2(x, y, z, o, t) +#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t) + +#define GG2_1(x, y, z, o, t) \ + bic o, z, x; +#define GG2_2(x, y, z, o, t) \ + and t, y, x; +#define GG2_3(x, y, z, o, t) \ + eor o, o, t; + +#define FF2_1(x, y, z, o, t) \ + eor o, x, y; +#define FF2_2(x, y, z, o, t) \ + and t, x, y; \ + and o, o, z; +#define FF2_3(x, y, z, o, t) \ + eor o, o, t; + +#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ + K_LOAD(round); \ + ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \ + rolw(t0, a, 12); /* rol(a, 12) => t0 */ \ + IOP(1, iop_param); \ + FF##i##_1(a, b, c, t1, t2); \ + ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \ + add k, k, e; \ + IOP(2, iop_param); \ + GG##i##_1(e, f, g, t3, t4); \ + FF##i##_2(a, b, c, t1, t2); \ + IOP(3, iop_param); \ + add k, k, t0; \ + add h, h, t5; \ + add d, d, t6; /* w1w2 + d => d */ \ + IOP(4, iop_param); \ + rolw(k, k, 7); /* rol (t0 + e + t), 7) => k */ \ + GG##i##_2(e, f, g, t3, t4); \ + add h, h, k; /* h + w1 + k => h */ \ + IOP(5, iop_param); \ + FF##i##_3(a, b, c, t1, t2); \ + eor t0, t0, k; /* k ^ t0 => t0 */ \ + GG##i##_3(e, f, g, t3, t4); \ + add d, d, t1; /* FF(a,b,c) + d => d */ \ + IOP(6, iop_param); \ + add t3, t3, h; /* GG(e,f,g) + h => t3 */ \ + rolw(b, b, 9); /* rol(b, 9) => b */ \ + eor h, t3, t3, ror #(32-9); \ + IOP(7, iop_param); \ + add d, d, t0; /* t0 + d => d */ \ + rolw(f, f, 19); /* rol(f, 19) => f */ \ + IOP(8, iop_param); \ + eor h, h, t3, ror #(32-17); /* P0(t3) => h */ + +#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ + R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param) + +#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ + R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param) + +#define KL(round) \ + ldp k_even, k_odd, [RKPTR, #(4*(round))]; + +/* Input expansion macros. */ + +/* Byte-swapped input address. */ +#define IW_W_ADDR(round, widx, offs) \ + (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4)) + +/* Expanded input address. */ +#define XW_W_ADDR(round, widx, offs) \ + (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4)) + +/* Rounds 1-12, byte-swapped input block addresses. */ +#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 32) +#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48) + +/* Rounds 1-12, expanded input block addresses. */ +#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0) +#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16) + +/* Input block loading. + * Interleaving within round function needed for in-order CPUs. */ +#define LOAD_W_VEC_1_1() \ + add addr0, sp, #IW_W1_ADDR(0, 0); +#define LOAD_W_VEC_1_2() \ + add addr1, sp, #IW_W1_ADDR(4, 0); +#define LOAD_W_VEC_1_3() \ + ld1 {W0.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_4() \ + ld1 {W1.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_5() \ + ld1 {W2.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_6() \ + ld1 {W3.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_7() \ + rev32 XTMP0.16b, W0.16b; +#define LOAD_W_VEC_1_8() \ + rev32 XTMP1.16b, W1.16b; +#define LOAD_W_VEC_2_1() \ + rev32 XTMP2.16b, W2.16b; +#define LOAD_W_VEC_2_2() \ + rev32 XTMP3.16b, W3.16b; +#define LOAD_W_VEC_2_3() \ + eor XTMP4.16b, XTMP1.16b, XTMP0.16b; +#define LOAD_W_VEC_2_4() \ + eor XTMP5.16b, XTMP2.16b, XTMP1.16b; +#define LOAD_W_VEC_2_5() \ + st1 {XTMP0.16b}, [addr0], #16; +#define LOAD_W_VEC_2_6() \ + st1 {XTMP4.16b}, [addr0]; \ + add addr0, sp, #IW_W1_ADDR(8, 0); +#define LOAD_W_VEC_2_7() \ + eor XTMP6.16b, XTMP3.16b, XTMP2.16b; +#define LOAD_W_VEC_2_8() \ + ext W0.16b, XTMP0.16b, XTMP0.16b, #8; /* W0: xx, w0, xx, xx */ +#define LOAD_W_VEC_3_1() \ + mov W2.16b, XTMP1.16b; /* W2: xx, w6, w5, w4 */ +#define LOAD_W_VEC_3_2() \ + st1 {XTMP1.16b}, [addr1], #16; +#define LOAD_W_VEC_3_3() \ + st1 {XTMP5.16b}, [addr1]; \ + ext W1.16b, XTMP0.16b, XTMP0.16b, #4; /* W1: xx, w3, w2, w1 */ +#define LOAD_W_VEC_3_4() \ + ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */ +#define LOAD_W_VEC_3_5() \ + ext W4.16b, XTMP2.16b, XTMP3.16b, #8; /* W4: xx, w12, w11, w10 */ +#define LOAD_W_VEC_3_6() \ + st1 {XTMP2.16b}, [addr0], #16; +#define LOAD_W_VEC_3_7() \ + st1 {XTMP6.16b}, [addr0]; +#define LOAD_W_VEC_3_8() \ + ext W5.16b, XTMP3.16b, XTMP3.16b, #4; /* W5: xx, w15, w14, w13 */ + +#define LOAD_W_VEC_1(iop_num, ...) \ + LOAD_W_VEC_1_##iop_num() +#define LOAD_W_VEC_2(iop_num, ...) \ + LOAD_W_VEC_2_##iop_num() +#define LOAD_W_VEC_3(iop_num, ...) \ + LOAD_W_VEC_3_##iop_num() + +/* Message scheduling. Note: 3 words per vector register. + * Interleaving within round function needed for in-order CPUs. */ +#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \ + /* Load (w[i - 16]) => XTMP0 */ \ + /* Load (w[i - 13]) => XTMP5 */ \ + ext XTMP0.16b, w0.16b, w0.16b, #12; /* XTMP0: w0, xx, xx, xx */ +#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP5.16b, w1.16b, w1.16b, #12; +#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */ +#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP5.16b, XTMP5.16b, w2.16b, #12; +#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 9] == w3 */ \ + /* W3 ^ XTMP0 => XTMP0 */ \ + eor XTMP0.16b, XTMP0.16b, w3.16b; +#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 3] == w5 */ \ + /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \ + /* rol(XTMP5, 7) => XTMP1 */ \ + add addr0, sp, #XW_W1_ADDR((round), 0); \ + shl XTMP2.4s, w5.4s, #15; +#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \ + shl XTMP1.4s, XTMP5.4s, #7; +#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP2.4s, w5.4s, #(32-15); +#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP1.4s, XTMP5.4s, #(32-7); +#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \ + eor XTMP0.16b, XTMP0.16b, XTMP2.16b; +#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 6] == W4 */ \ + /* W4 ^ XTMP1 => XTMP1 */ \ + eor XTMP1.16b, XTMP1.16b, w4.16b; +#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \ + /* P1(XTMP0) ^ XTMP1 => W0 */ \ + shl XTMP3.4s, XTMP0.4s, #15; +#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \ + shl XTMP4.4s, XTMP0.4s, #23; +#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \ + eor w0.16b, XTMP1.16b, XTMP0.16b; +#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP3.4s, XTMP0.4s, #(32-15); +#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP4.4s, XTMP0.4s, #(32-23); +#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \ + eor w0.16b, w0.16b, XTMP3.16b; +#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \ + /* Load (w[i - 3]) => XTMP2 */ \ + ext XTMP2.16b, w4.16b, w4.16b, #12; +#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \ + eor w0.16b, w0.16b, XTMP4.16b; +#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP2.16b, XTMP2.16b, w5.16b, #12; +#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \ + /* W1 ^ W2 => XTMP3 */ \ + eor XTMP3.16b, XTMP2.16b, w0.16b; +#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5) +#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \ + st1 {XTMP2.16b-XTMP3.16b}, [addr0]; +#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5) + +#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5) +#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5) +#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5) + +#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0) +#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0) +#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0) + +#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1) +#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1) +#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1) + +#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2) +#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2) +#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2) + +#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3) +#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3) +#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3) + +#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4) +#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4) +#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4) + + + /* + * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'. + * + * void sm3_neon_transform(struct sm3_state *sst, u8 const *src, + * int blocks) + */ + .text +.align 3 +SYM_TYPED_FUNC_START(sm3_neon_transform) + ldp ra, rb, [RSTATE, #0] + ldp rc, rd, [RSTATE, #8] + ldp re, rf, [RSTATE, #16] + ldp rg, rh, [RSTATE, #24] + + stp x28, x29, [sp, #-16]! + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + mov RFRAME, sp + + sub addr0, sp, #STACK_SIZE + adr_l RKPTR, .LKtable + and sp, addr0, #(~63) + + /* Preload first block. */ + LOAD_W_VEC_1(1, 0) + LOAD_W_VEC_1(2, 0) + LOAD_W_VEC_1(3, 0) + LOAD_W_VEC_1(4, 0) + LOAD_W_VEC_1(5, 0) + LOAD_W_VEC_1(6, 0) + LOAD_W_VEC_1(7, 0) + LOAD_W_VEC_1(8, 0) + LOAD_W_VEC_2(1, 0) + LOAD_W_VEC_2(2, 0) + LOAD_W_VEC_2(3, 0) + LOAD_W_VEC_2(4, 0) + LOAD_W_VEC_2(5, 0) + LOAD_W_VEC_2(6, 0) + LOAD_W_VEC_2(7, 0) + LOAD_W_VEC_2(8, 0) + LOAD_W_VEC_3(1, 0) + LOAD_W_VEC_3(2, 0) + LOAD_W_VEC_3(3, 0) + LOAD_W_VEC_3(4, 0) + LOAD_W_VEC_3(5, 0) + LOAD_W_VEC_3(6, 0) + LOAD_W_VEC_3(7, 0) + LOAD_W_VEC_3(8, 0) + +.balign 16 +.Loop: + /* Transform 0-3 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 1, 1, IW, _, 0) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0) + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 3, 3, IW, _, 0) + + /* Transform 4-7 + Precalc 12-14 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 5, 1, IW, _, 0) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12) + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12) + + /* Transform 8-11 + Precalc 12-17 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15) + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15) + + /* Transform 12-14 + Precalc 18-20 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18) + + /* Transform 15-17 + Precalc 21-23 */ + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21) + + /* Transform 18-20 + Precalc 24-26 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24) + + /* Transform 21-23 + Precalc 27-29 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27) + + /* Transform 24-26 + Precalc 30-32 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30) + + /* Transform 27-29 + Precalc 33-35 */ + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33) + + /* Transform 30-32 + Precalc 36-38 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36) + + /* Transform 33-35 + Precalc 39-41 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39) + + /* Transform 36-38 + Precalc 42-44 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42) + + /* Transform 39-41 + Precalc 45-47 */ + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45) + + /* Transform 42-44 + Precalc 48-50 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48) + + /* Transform 45-47 + Precalc 51-53 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51) + + /* Transform 48-50 + Precalc 54-56 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54) + + /* Transform 51-53 + Precalc 57-59 */ + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57) + + /* Transform 54-56 + Precalc 60-62 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60) + + /* Transform 57-59 + Precalc 63 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63) + + /* Transform 60 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _) + subs RNBLKS, RNBLKS, #1 + b.eq .Lend + + /* Transform 61-63 + Preload next block */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, LOAD_W_VEC_1, _) + ldp s0, s1, [RSTATE, #0] + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _) + ldp s2, s3, [RSTATE, #8] + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, LOAD_W_VEC_3, _) + + /* Update the chaining variables. */ + eor ra, ra, s0 + eor rb, rb, s1 + ldp s0, s1, [RSTATE, #16] + eor rc, rc, s2 + ldp k_even, k_odd, [RSTATE, #24] + eor rd, rd, s3 + eor re, re, s0 + stp ra, rb, [RSTATE, #0] + eor rf, rf, s1 + stp rc, rd, [RSTATE, #8] + eor rg, rg, k_even + stp re, rf, [RSTATE, #16] + eor rh, rh, k_odd + stp rg, rh, [RSTATE, #24] + b .Loop + +.Lend: + /* Transform 61-63 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, _, _) + ldp s0, s1, [RSTATE, #0] + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _) + ldp s2, s3, [RSTATE, #8] + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, _, _) + + /* Update the chaining variables. */ + eor ra, ra, s0 + clear_vec(W0) + eor rb, rb, s1 + clear_vec(W1) + ldp s0, s1, [RSTATE, #16] + clear_vec(W2) + eor rc, rc, s2 + clear_vec(W3) + ldp k_even, k_odd, [RSTATE, #24] + clear_vec(W4) + eor rd, rd, s3 + clear_vec(W5) + eor re, re, s0 + clear_vec(XTMP0) + stp ra, rb, [RSTATE, #0] + clear_vec(XTMP1) + eor rf, rf, s1 + clear_vec(XTMP2) + stp rc, rd, [RSTATE, #8] + clear_vec(XTMP3) + eor rg, rg, k_even + clear_vec(XTMP4) + stp re, rf, [RSTATE, #16] + clear_vec(XTMP5) + eor rh, rh, k_odd + clear_vec(XTMP6) + stp rg, rh, [RSTATE, #24] + + /* Clear message expansion area */ + add addr0, sp, #STACK_W + st1 {W0.16b-W3.16b}, [addr0], #64 + st1 {W0.16b-W3.16b}, [addr0], #64 + st1 {W0.16b-W3.16b}, [addr0] + + mov sp, RFRAME + + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ldp x28, x29, [sp], #16 + + ret +SYM_FUNC_END(sm3_neon_transform) + + + .section ".rodata", "a" + + .align 4 +.LKtable: + .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb + .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc + .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce + .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6 + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 + .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53 + .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d + .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4 + .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43 + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c new file mode 100644 index 000000000..7182ee683 --- /dev/null +++ b/arch/arm64/crypto/sm3-neon-glue.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * sm3-neon-glue.c - SM3 secure hash using NEON instructions + * + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/sm3.h> +#include <crypto/sm3_base.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + + +asmlinkage void sm3_neon_transform(struct sm3_state *sst, u8 const *src, + int blocks); + +static int sm3_neon_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + if (!crypto_simd_usable()) { + sm3_update(shash_desc_ctx(desc), data, len); + return 0; + } + + kernel_neon_begin(); + sm3_base_do_update(desc, data, len, sm3_neon_transform); + kernel_neon_end(); + + return 0; +} + +static int sm3_neon_final(struct shash_desc *desc, u8 *out) +{ + if (!crypto_simd_usable()) { + sm3_final(shash_desc_ctx(desc), out); + return 0; + } + + kernel_neon_begin(); + sm3_base_do_finalize(desc, sm3_neon_transform); + kernel_neon_end(); + + return sm3_base_finish(desc, out); +} + +static int sm3_neon_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!crypto_simd_usable()) { + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + sm3_final(sctx, out); + return 0; + } + + kernel_neon_begin(); + if (len) + sm3_base_do_update(desc, data, len, sm3_neon_transform); + sm3_base_do_finalize(desc, sm3_neon_transform); + kernel_neon_end(); + + return sm3_base_finish(desc, out); +} + +static struct shash_alg sm3_alg = { + .digestsize = SM3_DIGEST_SIZE, + .init = sm3_base_init, + .update = sm3_neon_update, + .final = sm3_neon_final, + .finup = sm3_neon_finup, + .descsize = sizeof(struct sm3_state), + .base.cra_name = "sm3", + .base.cra_driver_name = "sm3-neon", + .base.cra_blocksize = SM3_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}; + +static int __init sm3_neon_init(void) +{ + return crypto_register_shash(&sm3_alg); +} + +static void __exit sm3_neon_fini(void) +{ + crypto_unregister_shash(&sm3_alg); +} + +module_init(sm3_neon_init); +module_exit(sm3_neon_fini); + +MODULE_DESCRIPTION("SM3 secure hash using NEON instructions"); +MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/crypto/sm4-ce-cipher-core.S b/arch/arm64/crypto/sm4-ce-cipher-core.S new file mode 100644 index 000000000..4ac6cfbc5 --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-cipher-core.S @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8 + .set .Lv\b\().4s, \b + .endr + + .macro sm4e, rd, rn + .inst 0xcec08400 | .L\rd | (.L\rn << 5) + .endm + + /* + * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in); + */ + .text +SYM_FUNC_START(sm4_ce_do_crypt) + ld1 {v8.4s}, [x2] + ld1 {v0.4s-v3.4s}, [x0], #64 +CPU_LE( rev32 v8.16b, v8.16b ) + ld1 {v4.4s-v7.4s}, [x0] + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 +CPU_LE( rev32 v8.16b, v8.16b ) + st1 {v8.4s}, [x1] + ret +SYM_FUNC_END(sm4_ce_do_crypt) diff --git a/arch/arm64/crypto/sm4-ce-cipher-glue.c b/arch/arm64/crypto/sm4-ce-cipher-glue.c new file mode 100644 index 000000000..76a34ef4a --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-cipher-glue.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/sm4.h> +#include <crypto/internal/simd.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/types.h> + +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-ce"); +MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in); + +static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + if (!crypto_simd_usable()) { + sm4_crypt_block(ctx->rkey_enc, out, in); + } else { + kernel_neon_begin(); + sm4_ce_do_crypt(ctx->rkey_enc, out, in); + kernel_neon_end(); + } +} + +static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + if (!crypto_simd_usable()) { + sm4_crypt_block(ctx->rkey_dec, out, in); + } else { + kernel_neon_begin(); + sm4_ce_do_crypt(ctx->rkey_dec, out, in); + kernel_neon_end(); + } +} + +static struct crypto_alg sm4_ce_alg = { + .cra_name = "sm4", + .cra_driver_name = "sm4-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + .cra_u.cipher = { + .cia_min_keysize = SM4_KEY_SIZE, + .cia_max_keysize = SM4_KEY_SIZE, + .cia_setkey = sm4_ce_setkey, + .cia_encrypt = sm4_ce_encrypt, + .cia_decrypt = sm4_ce_decrypt + } +}; + +static int __init sm4_ce_mod_init(void) +{ + return crypto_register_alg(&sm4_ce_alg); +} + +static void __exit sm4_ce_mod_fini(void) +{ + crypto_unregister_alg(&sm4_ce_alg); +} + +module_cpu_feature_match(SM4, sm4_ce_mod_init); +module_exit(sm4_ce_mod_fini); diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S new file mode 100644 index 000000000..934e0f093 --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-core.S @@ -0,0 +1,660 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +.arch armv8-a+crypto + +.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31 + .set .Lv\b\().4s, \b +.endr + +.macro sm4e, vd, vn + .inst 0xcec08400 | (.L\vn << 5) | .L\vd +.endm + +.macro sm4ekey, vd, vn, vm + .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd +.endm + +/* Register macros */ + +#define RTMP0 v16 +#define RTMP1 v17 +#define RTMP2 v18 +#define RTMP3 v19 + +#define RIV v20 + +/* Helper macros. */ + +#define PREPARE \ + ld1 {v24.16b-v27.16b}, [x0], #64; \ + ld1 {v28.16b-v31.16b}, [x0]; + +#define SM4_CRYPT_BLK(b0) \ + rev32 b0.16b, b0.16b; \ + sm4e b0.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + rev32 b0.16b, b0.16b; + +#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b2.4s, v24.4s; \ + sm4e b3.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b2.4s, v25.4s; \ + sm4e b3.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b2.4s, v26.4s; \ + sm4e b3.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b2.4s, v27.4s; \ + sm4e b3.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b2.4s, v28.4s; \ + sm4e b3.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b2.4s, v29.4s; \ + sm4e b3.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b2.4s, v30.4s; \ + sm4e b3.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + sm4e b2.4s, v31.4s; \ + sm4e b3.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + rev64 b2.4s, b2.4s; \ + rev64 b3.4s, b3.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + ext b2.16b, b2.16b, b2.16b, #8; \ + ext b3.16b, b3.16b, b3.16b, #8; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; + +#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b2.4s, v24.4s; \ + sm4e b3.4s, v24.4s; \ + sm4e b4.4s, v24.4s; \ + sm4e b5.4s, v24.4s; \ + sm4e b6.4s, v24.4s; \ + sm4e b7.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b2.4s, v25.4s; \ + sm4e b3.4s, v25.4s; \ + sm4e b4.4s, v25.4s; \ + sm4e b5.4s, v25.4s; \ + sm4e b6.4s, v25.4s; \ + sm4e b7.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b2.4s, v26.4s; \ + sm4e b3.4s, v26.4s; \ + sm4e b4.4s, v26.4s; \ + sm4e b5.4s, v26.4s; \ + sm4e b6.4s, v26.4s; \ + sm4e b7.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b2.4s, v27.4s; \ + sm4e b3.4s, v27.4s; \ + sm4e b4.4s, v27.4s; \ + sm4e b5.4s, v27.4s; \ + sm4e b6.4s, v27.4s; \ + sm4e b7.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b2.4s, v28.4s; \ + sm4e b3.4s, v28.4s; \ + sm4e b4.4s, v28.4s; \ + sm4e b5.4s, v28.4s; \ + sm4e b6.4s, v28.4s; \ + sm4e b7.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b2.4s, v29.4s; \ + sm4e b3.4s, v29.4s; \ + sm4e b4.4s, v29.4s; \ + sm4e b5.4s, v29.4s; \ + sm4e b6.4s, v29.4s; \ + sm4e b7.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b2.4s, v30.4s; \ + sm4e b3.4s, v30.4s; \ + sm4e b4.4s, v30.4s; \ + sm4e b5.4s, v30.4s; \ + sm4e b6.4s, v30.4s; \ + sm4e b7.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + sm4e b2.4s, v31.4s; \ + sm4e b3.4s, v31.4s; \ + sm4e b4.4s, v31.4s; \ + sm4e b5.4s, v31.4s; \ + sm4e b6.4s, v31.4s; \ + sm4e b7.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + rev64 b2.4s, b2.4s; \ + rev64 b3.4s, b3.4s; \ + rev64 b4.4s, b4.4s; \ + rev64 b5.4s, b5.4s; \ + rev64 b6.4s, b6.4s; \ + rev64 b7.4s, b7.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + ext b2.16b, b2.16b, b2.16b, #8; \ + ext b3.16b, b3.16b, b3.16b, #8; \ + ext b4.16b, b4.16b, b4.16b, #8; \ + ext b5.16b, b5.16b, b5.16b, #8; \ + ext b6.16b, b6.16b, b6.16b, #8; \ + ext b7.16b, b7.16b, b7.16b, #8; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; + + +.align 3 +SYM_FUNC_START(sm4_ce_expand_key) + /* input: + * x0: 128-bit key + * x1: rkey_enc + * x2: rkey_dec + * x3: fk array + * x4: ck array + */ + ld1 {v0.16b}, [x0]; + rev32 v0.16b, v0.16b; + ld1 {v1.16b}, [x3]; + /* load ck */ + ld1 {v24.16b-v27.16b}, [x4], #64; + ld1 {v28.16b-v31.16b}, [x4]; + + /* input ^ fk */ + eor v0.16b, v0.16b, v1.16b; + + sm4ekey v0.4s, v0.4s, v24.4s; + sm4ekey v1.4s, v0.4s, v25.4s; + sm4ekey v2.4s, v1.4s, v26.4s; + sm4ekey v3.4s, v2.4s, v27.4s; + sm4ekey v4.4s, v3.4s, v28.4s; + sm4ekey v5.4s, v4.4s, v29.4s; + sm4ekey v6.4s, v5.4s, v30.4s; + sm4ekey v7.4s, v6.4s, v31.4s; + + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b-v7.16b}, [x1]; + rev64 v7.4s, v7.4s; + rev64 v6.4s, v6.4s; + rev64 v5.4s, v5.4s; + rev64 v4.4s, v4.4s; + rev64 v3.4s, v3.4s; + rev64 v2.4s, v2.4s; + rev64 v1.4s, v1.4s; + rev64 v0.4s, v0.4s; + ext v7.16b, v7.16b, v7.16b, #8; + ext v6.16b, v6.16b, v6.16b, #8; + ext v5.16b, v5.16b, v5.16b, #8; + ext v4.16b, v4.16b, v4.16b, #8; + ext v3.16b, v3.16b, v3.16b, #8; + ext v2.16b, v2.16b, v2.16b, #8; + ext v1.16b, v1.16b, v1.16b, #8; + ext v0.16b, v0.16b, v0.16b, #8; + st1 {v7.16b}, [x2], #16; + st1 {v6.16b}, [x2], #16; + st1 {v5.16b}, [x2], #16; + st1 {v4.16b}, [x2], #16; + st1 {v3.16b}, [x2], #16; + st1 {v2.16b}, [x2], #16; + st1 {v1.16b}, [x2], #16; + st1 {v0.16b}, [x2]; + + ret; +SYM_FUNC_END(sm4_ce_expand_key) + +.align 3 +SYM_FUNC_START(sm4_ce_crypt_block) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + */ + PREPARE; + + ld1 {v0.16b}, [x2]; + SM4_CRYPT_BLK(v0); + st1 {v0.16b}, [x1]; + + ret; +SYM_FUNC_END(sm4_ce_crypt_block) + +.align 3 +SYM_FUNC_START(sm4_ce_crypt) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: nblocks + */ + PREPARE; + +.Lcrypt_loop_blk: + sub w3, w3, #8; + tbnz w3, #31, .Lcrypt_tail8; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2], #64; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b-v7.16b}, [x1], #64; + + cbz w3, .Lcrypt_end; + b .Lcrypt_loop_blk; + +.Lcrypt_tail8: + add w3, w3, #8; + cmp w3, #4; + blt .Lcrypt_tail4; + + sub w3, w3, #4; + + ld1 {v0.16b-v3.16b}, [x2], #64; + SM4_CRYPT_BLK4(v0, v1, v2, v3); + st1 {v0.16b-v3.16b}, [x1], #64; + + cbz w3, .Lcrypt_end; + +.Lcrypt_tail4: + sub w3, w3, #1; + + ld1 {v0.16b}, [x2], #16; + SM4_CRYPT_BLK(v0); + st1 {v0.16b}, [x1], #16; + + cbnz w3, .Lcrypt_tail4; + +.Lcrypt_end: + ret; +SYM_FUNC_END(sm4_ce_crypt) + +.align 3 +SYM_FUNC_START(sm4_ce_cbc_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcbc_enc_loop: + sub w4, w4, #1; + + ld1 {RTMP0.16b}, [x2], #16; + eor RIV.16b, RIV.16b, RTMP0.16b; + + SM4_CRYPT_BLK(RIV); + + st1 {RIV.16b}, [x1], #16; + + cbnz w4, .Lcbc_enc_loop; + + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cbc_enc) + +.align 3 +SYM_FUNC_START(sm4_ce_cbc_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcbc_loop_blk: + sub w4, w4, #8; + tbnz w4, #31, .Lcbc_tail8; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #64; + eor v0.16b, v0.16b, RIV.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v1.16b, v1.16b, RTMP0.16b; + eor v2.16b, v2.16b, RTMP1.16b; + eor v3.16b, v3.16b, RTMP2.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + eor v4.16b, v4.16b, RTMP3.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v5.16b, v5.16b, RTMP0.16b; + eor v6.16b, v6.16b, RTMP1.16b; + eor v7.16b, v7.16b, RTMP2.16b; + + mov RIV.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + cbz w4, .Lcbc_end; + b .Lcbc_loop_blk; + +.Lcbc_tail8: + add w4, w4, #8; + cmp w4, #4; + blt .Lcbc_tail4; + + sub w4, w4, #4; + + ld1 {v0.16b-v3.16b}, [x2]; + + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + eor v0.16b, v0.16b, RIV.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v1.16b, v1.16b, RTMP0.16b; + eor v2.16b, v2.16b, RTMP1.16b; + eor v3.16b, v3.16b, RTMP2.16b; + + mov RIV.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + cbz w4, .Lcbc_end; + +.Lcbc_tail4: + sub w4, w4, #1; + + ld1 {v0.16b}, [x2]; + + SM4_CRYPT_BLK(v0); + + eor v0.16b, v0.16b, RIV.16b; + ld1 {RIV.16b}, [x2], #16; + st1 {v0.16b}, [x1], #16; + + cbnz w4, .Lcbc_tail4; + +.Lcbc_end: + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cbc_dec) + +.align 3 +SYM_FUNC_START(sm4_ce_cfb_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcfb_enc_loop: + sub w4, w4, #1; + + SM4_CRYPT_BLK(RIV); + + ld1 {RTMP0.16b}, [x2], #16; + eor RIV.16b, RIV.16b, RTMP0.16b; + st1 {RIV.16b}, [x1], #16; + + cbnz w4, .Lcfb_enc_loop; + + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cfb_enc) + +.align 3 +SYM_FUNC_START(sm4_ce_cfb_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ld1 {v0.16b}, [x3]; + +.Lcfb_loop_blk: + sub w4, w4, #8; + tbnz w4, #31, .Lcfb_tail8; + + ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #48; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + mov v0.16b, RTMP3.16b; + + cbz w4, .Lcfb_end; + b .Lcfb_loop_blk; + +.Lcfb_tail8: + add w4, w4, #8; + cmp w4, #4; + blt .Lcfb_tail4; + + sub w4, w4, #4; + + ld1 {v1.16b, v2.16b, v3.16b}, [x2]; + + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + mov v0.16b, RTMP3.16b; + + cbz w4, .Lcfb_end; + +.Lcfb_tail4: + sub w4, w4, #1; + + SM4_CRYPT_BLK(v0); + + ld1 {RTMP0.16b}, [x2], #16; + eor v0.16b, v0.16b, RTMP0.16b; + st1 {v0.16b}, [x1], #16; + + mov v0.16b, RTMP0.16b; + + cbnz w4, .Lcfb_tail4; + +.Lcfb_end: + /* store new IV */ + st1 {v0.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_cfb_dec) + +.align 3 +SYM_FUNC_START(sm4_ce_ctr_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nblocks + */ + PREPARE; + + ldp x7, x8, [x3]; + rev x7, x7; + rev x8, x8; + +.Lctr_loop_blk: + sub w4, w4, #8; + tbnz w4, #31, .Lctr_tail8; + +#define inc_le128(vctr) \ + mov vctr.d[1], x8; \ + mov vctr.d[0], x7; \ + adds x8, x8, #1; \ + adc x7, x7, xzr; \ + rev64 vctr.16b, vctr.16b; + + /* construct CTRs */ + inc_le128(v0); /* +0 */ + inc_le128(v1); /* +1 */ + inc_le128(v2); /* +2 */ + inc_le128(v3); /* +3 */ + inc_le128(v4); /* +4 */ + inc_le128(v5); /* +5 */ + inc_le128(v6); /* +6 */ + inc_le128(v7); /* +7 */ + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + cbz w4, .Lctr_end; + b .Lctr_loop_blk; + +.Lctr_tail8: + add w4, w4, #8; + cmp w4, #4; + blt .Lctr_tail4; + + sub w4, w4, #4; + + /* construct CTRs */ + inc_le128(v0); /* +0 */ + inc_le128(v1); /* +1 */ + inc_le128(v2); /* +2 */ + inc_le128(v3); /* +3 */ + + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + cbz w4, .Lctr_end; + +.Lctr_tail4: + sub w4, w4, #1; + + /* construct CTRs */ + inc_le128(v0); + + SM4_CRYPT_BLK(v0); + + ld1 {RTMP0.16b}, [x2], #16; + eor v0.16b, v0.16b, RTMP0.16b; + st1 {v0.16b}, [x1], #16; + + cbnz w4, .Lctr_tail4; + +.Lctr_end: + /* store new CTR */ + rev x7, x7; + rev x8, x8; + stp x7, x8, [x3]; + + ret; +SYM_FUNC_END(sm4_ce_ctr_enc) diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c new file mode 100644 index 000000000..496d55c0d --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -0,0 +1,372 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, using ARMv8 Crypto Extensions + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> + +#define BYTES2BLKS(nbytes) ((nbytes) >> 4) + +asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec, + const u32 *fk, const u32 *ck); +asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src); +asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src, + unsigned int nblks); +asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); + +static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + return 0; +} + +static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) +{ + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_crypt(rkey, dst, src, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_enc); +} + +static int sm4_ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_dec); +} + +static int sm4_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cfb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cfb_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cfb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_cfb_dec(ctx->rkey_enc, dst, src, walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_ctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static struct skcipher_alg sm4_algs[] = { + { + .base = { + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ecb_encrypt, + .decrypt = sm4_ecb_decrypt, + }, { + .base = { + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = sm4_cbc_decrypt, + }, { + .base = { + .cra_name = "cfb(sm4)", + .cra_driver_name = "cfb-sm4-ce", + .cra_priority = 400, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = sm4_cfb_decrypt, + }, { + .base = { + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-ce", + .cra_priority = 400, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ctr_crypt, + .decrypt = sm4_ctr_crypt, + } +}; + +static int __init sm4_init(void) +{ + return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); +} + +static void __exit sm4_exit(void) +{ + crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); +} + +module_cpu_feature_match(SM4, sm4_init); +module_exit(sm4_exit); + +MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 Crypto Extensions"); +MODULE_ALIAS_CRYPTO("sm4-ce"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("ecb(sm4)"); +MODULE_ALIAS_CRYPTO("cbc(sm4)"); +MODULE_ALIAS_CRYPTO("cfb(sm4)"); +MODULE_ALIAS_CRYPTO("ctr(sm4)"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/crypto/sm4-neon-core.S b/arch/arm64/crypto/sm4-neon-core.S new file mode 100644 index 000000000..3d5256b35 --- /dev/null +++ b/arch/arm64/crypto/sm4-neon-core.S @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm for ARMv8 NEON + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* Register macros */ + +#define RTMP0 v8 +#define RTMP1 v9 +#define RTMP2 v10 +#define RTMP3 v11 + +#define RX0 v12 +#define RX1 v13 +#define RKEY v14 +#define RIV v15 + +/* Helper macros. */ + +#define PREPARE \ + adr_l x5, crypto_sm4_sbox; \ + ld1 {v16.16b-v19.16b}, [x5], #64; \ + ld1 {v20.16b-v23.16b}, [x5], #64; \ + ld1 {v24.16b-v27.16b}, [x5], #64; \ + ld1 {v28.16b-v31.16b}, [x5]; + +#define transpose_4x4(s0, s1, s2, s3) \ + zip1 RTMP0.4s, s0.4s, s1.4s; \ + zip1 RTMP1.4s, s2.4s, s3.4s; \ + zip2 RTMP2.4s, s0.4s, s1.4s; \ + zip2 RTMP3.4s, s2.4s, s3.4s; \ + zip1 s0.2d, RTMP0.2d, RTMP1.2d; \ + zip2 s1.2d, RTMP0.2d, RTMP1.2d; \ + zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ + zip2 s3.2d, RTMP2.2d, RTMP3.2d; + +#define rotate_clockwise_90(s0, s1, s2, s3) \ + zip1 RTMP0.4s, s1.4s, s0.4s; \ + zip2 RTMP1.4s, s1.4s, s0.4s; \ + zip1 RTMP2.4s, s3.4s, s2.4s; \ + zip2 RTMP3.4s, s3.4s, s2.4s; \ + zip1 s0.2d, RTMP2.2d, RTMP0.2d; \ + zip2 s1.2d, RTMP2.2d, RTMP0.2d; \ + zip1 s2.2d, RTMP3.2d, RTMP1.2d; \ + zip2 s3.2d, RTMP3.2d, RTMP1.2d; + +#define ROUND4(round, s0, s1, s2, s3) \ + dup RX0.4s, RKEY.s[round]; \ + /* rk ^ s1 ^ s2 ^ s3 */ \ + eor RTMP1.16b, s2.16b, s3.16b; \ + eor RX0.16b, RX0.16b, s1.16b; \ + eor RX0.16b, RX0.16b, RTMP1.16b; \ + \ + /* sbox, non-linear part */ \ + movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \ + tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \ + \ + /* linear part */ \ + shl RTMP1.4s, RTMP0.4s, #8; \ + shl RTMP2.4s, RTMP0.4s, #16; \ + shl RTMP3.4s, RTMP0.4s, #24; \ + sri RTMP1.4s, RTMP0.4s, #(32-8); \ + sri RTMP2.4s, RTMP0.4s, #(32-16); \ + sri RTMP3.4s, RTMP0.4s, #(32-24); \ + /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \ + eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \ + eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \ + /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \ + eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \ + shl RTMP2.4s, RTMP1.4s, 2; \ + sri RTMP2.4s, RTMP1.4s, #(32-2); \ + eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \ + /* s0 ^= RTMP3 */ \ + eor s0.16b, s0.16b, RTMP3.16b; + +#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + \ + transpose_4x4(b0, b1, b2, b3); \ + \ + mov x6, 8; \ +4: \ + ld1 {RKEY.4s}, [x0], #16; \ + subs x6, x6, #1; \ + \ + ROUND4(0, b0, b1, b2, b3); \ + ROUND4(1, b1, b2, b3, b0); \ + ROUND4(2, b2, b3, b0, b1); \ + ROUND4(3, b3, b0, b1, b2); \ + \ + bne 4b; \ + \ + rotate_clockwise_90(b0, b1, b2, b3); \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + \ + /* repoint to rkey */ \ + sub x0, x0, #128; + +#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \ + /* rk ^ s1 ^ s2 ^ s3 */ \ + dup RX0.4s, RKEY.s[round]; \ + eor RTMP0.16b, s2.16b, s3.16b; \ + mov RX1.16b, RX0.16b; \ + eor RTMP1.16b, t2.16b, t3.16b; \ + eor RX0.16b, RX0.16b, s1.16b; \ + eor RX1.16b, RX1.16b, t1.16b; \ + eor RX0.16b, RX0.16b, RTMP0.16b; \ + eor RX1.16b, RX1.16b, RTMP1.16b; \ + \ + /* sbox, non-linear part */ \ + movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \ + tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \ + tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + sub RX1.16b, RX1.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \ + tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + sub RX1.16b, RX1.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \ + tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \ + sub RX0.16b, RX0.16b, RTMP3.16b; \ + sub RX1.16b, RX1.16b, RTMP3.16b; \ + tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \ + tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \ + \ + /* linear part */ \ + shl RX0.4s, RTMP0.4s, #8; \ + shl RX1.4s, RTMP1.4s, #8; \ + shl RTMP2.4s, RTMP0.4s, #16; \ + shl RTMP3.4s, RTMP1.4s, #16; \ + sri RX0.4s, RTMP0.4s, #(32 - 8); \ + sri RX1.4s, RTMP1.4s, #(32 - 8); \ + sri RTMP2.4s, RTMP0.4s, #(32 - 16); \ + sri RTMP3.4s, RTMP1.4s, #(32 - 16); \ + /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \ + eor RX0.16b, RX0.16b, RTMP0.16b; \ + eor RX1.16b, RX1.16b, RTMP1.16b; \ + eor RX0.16b, RX0.16b, RTMP2.16b; \ + eor RX1.16b, RX1.16b, RTMP3.16b; \ + /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \ + shl RTMP2.4s, RTMP0.4s, #24; \ + shl RTMP3.4s, RTMP1.4s, #24; \ + sri RTMP2.4s, RTMP0.4s, #(32 - 24); \ + sri RTMP3.4s, RTMP1.4s, #(32 - 24); \ + eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \ + eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \ + shl RTMP2.4s, RX0.4s, #2; \ + shl RTMP3.4s, RX1.4s, #2; \ + sri RTMP2.4s, RX0.4s, #(32 - 2); \ + sri RTMP3.4s, RX1.4s, #(32 - 2); \ + eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \ + eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \ + /* s0/t0 ^= RTMP0/1 */ \ + eor s0.16b, s0.16b, RTMP0.16b; \ + eor t0.16b, t0.16b, RTMP1.16b; + +#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; \ + \ + transpose_4x4(b0, b1, b2, b3); \ + transpose_4x4(b4, b5, b6, b7); \ + \ + mov x6, 8; \ +8: \ + ld1 {RKEY.4s}, [x0], #16; \ + subs x6, x6, #1; \ + \ + ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \ + ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \ + ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \ + ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \ + \ + bne 8b; \ + \ + rotate_clockwise_90(b0, b1, b2, b3); \ + rotate_clockwise_90(b4, b5, b6, b7); \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; \ + \ + /* repoint to rkey */ \ + sub x0, x0, #128; + + +.align 3 +SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: num blocks (1..4) + */ + PREPARE; + + ld1 {v0.16b}, [x2], #16; + mov v1.16b, v0.16b; + mov v2.16b, v0.16b; + mov v3.16b, v0.16b; + cmp w3, #2; + blt .Lblk4_load_input_done; + ld1 {v1.16b}, [x2], #16; + beq .Lblk4_load_input_done; + ld1 {v2.16b}, [x2], #16; + cmp w3, #3; + beq .Lblk4_load_input_done; + ld1 {v3.16b}, [x2]; + +.Lblk4_load_input_done: + SM4_CRYPT_BLK4(v0, v1, v2, v3); + + st1 {v0.16b}, [x1], #16; + cmp w3, #2; + blt .Lblk4_store_output_done; + st1 {v1.16b}, [x1], #16; + beq .Lblk4_store_output_done; + st1 {v2.16b}, [x1], #16; + cmp w3, #3; + beq .Lblk4_store_output_done; + st1 {v3.16b}, [x1]; + +.Lblk4_store_output_done: + ret; +SYM_FUNC_END(__sm4_neon_crypt_blk1_4) + +.align 3 +SYM_FUNC_START(sm4_neon_crypt_blk1_8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: num blocks (1..8) + */ + cmp w3, #5; + blt __sm4_neon_crypt_blk1_4; + + PREPARE; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b}, [x2], #16; + mov v5.16b, v4.16b; + mov v6.16b, v4.16b; + mov v7.16b, v4.16b; + beq .Lblk8_load_input_done; + ld1 {v5.16b}, [x2], #16; + cmp w3, #7; + blt .Lblk8_load_input_done; + ld1 {v6.16b}, [x2], #16; + beq .Lblk8_load_input_done; + ld1 {v7.16b}, [x2]; + +.Lblk8_load_input_done: + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + cmp w3, #6; + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b}, [x1], #16; + blt .Lblk8_store_output_done; + st1 {v5.16b}, [x1], #16; + beq .Lblk8_store_output_done; + st1 {v6.16b}, [x1], #16; + cmp w3, #7; + beq .Lblk8_store_output_done; + st1 {v7.16b}, [x1]; + +.Lblk8_store_output_done: + ret; +SYM_FUNC_END(sm4_neon_crypt_blk1_8) + +.align 3 +SYM_FUNC_START(sm4_neon_crypt_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * w3: nblocks (multiples of 8) + */ + PREPARE; + +.Lcrypt_loop_blk: + subs w3, w3, #8; + bmi .Lcrypt_end; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2], #64; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + st1 {v0.16b-v3.16b}, [x1], #64; + st1 {v4.16b-v7.16b}, [x1], #64; + + b .Lcrypt_loop_blk; + +.Lcrypt_end: + ret; +SYM_FUNC_END(sm4_neon_crypt_blk8) + +.align 3 +SYM_FUNC_START(sm4_neon_cbc_dec_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks (multiples of 8) + */ + PREPARE; + + ld1 {RIV.16b}, [x3]; + +.Lcbc_loop_blk: + subs w4, w4, #8; + bmi .Lcbc_end; + + ld1 {v0.16b-v3.16b}, [x2], #64; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #64; + eor v0.16b, v0.16b, RIV.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v1.16b, v1.16b, RTMP0.16b; + eor v2.16b, v2.16b, RTMP1.16b; + eor v3.16b, v3.16b, RTMP2.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + eor v4.16b, v4.16b, RTMP3.16b; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v5.16b, v5.16b, RTMP0.16b; + eor v6.16b, v6.16b, RTMP1.16b; + eor v7.16b, v7.16b, RTMP2.16b; + + mov RIV.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + b .Lcbc_loop_blk; + +.Lcbc_end: + /* store new IV */ + st1 {RIV.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_neon_cbc_dec_blk8) + +.align 3 +SYM_FUNC_START(sm4_neon_cfb_dec_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nblocks (multiples of 8) + */ + PREPARE; + + ld1 {v0.16b}, [x3]; + +.Lcfb_loop_blk: + subs w4, w4, #8; + bmi .Lcfb_end; + + ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48; + ld1 {v4.16b-v7.16b}, [x2]; + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + sub x2, x2, #48; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + mov v0.16b, RTMP3.16b; + + b .Lcfb_loop_blk; + +.Lcfb_end: + /* store new IV */ + st1 {v0.16b}, [x3]; + + ret; +SYM_FUNC_END(sm4_neon_cfb_dec_blk8) + +.align 3 +SYM_FUNC_START(sm4_neon_ctr_enc_blk8) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nblocks (multiples of 8) + */ + PREPARE; + + ldp x7, x8, [x3]; + rev x7, x7; + rev x8, x8; + +.Lctr_loop_blk: + subs w4, w4, #8; + bmi .Lctr_end; + +#define inc_le128(vctr) \ + mov vctr.d[1], x8; \ + mov vctr.d[0], x7; \ + adds x8, x8, #1; \ + adc x7, x7, xzr; \ + rev64 vctr.16b, vctr.16b; + + /* construct CTRs */ + inc_le128(v0); /* +0 */ + inc_le128(v1); /* +1 */ + inc_le128(v2); /* +2 */ + inc_le128(v3); /* +3 */ + inc_le128(v4); /* +4 */ + inc_le128(v5); /* +5 */ + inc_le128(v6); /* +6 */ + inc_le128(v7); /* +7 */ + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v0.16b, v0.16b, RTMP0.16b; + eor v1.16b, v1.16b, RTMP1.16b; + eor v2.16b, v2.16b, RTMP2.16b; + eor v3.16b, v3.16b, RTMP3.16b; + st1 {v0.16b-v3.16b}, [x1], #64; + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; + eor v4.16b, v4.16b, RTMP0.16b; + eor v5.16b, v5.16b, RTMP1.16b; + eor v6.16b, v6.16b, RTMP2.16b; + eor v7.16b, v7.16b, RTMP3.16b; + st1 {v4.16b-v7.16b}, [x1], #64; + + b .Lctr_loop_blk; + +.Lctr_end: + /* store new CTR */ + rev x7, x7; + rev x8, x8; + stp x7, x8, [x3]; + + ret; +SYM_FUNC_END(sm4_neon_ctr_enc_blk8) diff --git a/arch/arm64/crypto/sm4-neon-glue.c b/arch/arm64/crypto/sm4-neon-glue.c new file mode 100644 index 000000000..03a6a6866 --- /dev/null +++ b/arch/arm64/crypto/sm4-neon-glue.c @@ -0,0 +1,442 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, using ARMv8 NEON + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2022, Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> + +#define BYTES2BLKS(nbytes) ((nbytes) >> 4) +#define BYTES2BLK8(nbytes) (((nbytes) >> 4) & ~(8 - 1)) + +asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst, const u8 *src, + unsigned int nblks); +asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst, const u8 *src, + unsigned int nblks); +asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); +asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblks); + +static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) +{ + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_crypt_blk8(rkey, dst, src, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + sm4_neon_crypt_blk1_8(rkey, dst, src, nblks); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_enc); +} + +static int sm4_ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_ecb_do_crypt(req, ctx->rkey_dec); +} + +static int sm4_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); + sm4_crypt_block(ctx->rkey_enc, dst, dst); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_cbc_dec_blk8(ctx->rkey_dec, dst, src, + walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + u8 iv[SM4_BLOCK_SIZE]; + int i; + + sm4_neon_crypt_blk1_8(ctx->rkey_dec, keystream, + src, nblks); + + src += ((int)nblks - 2) * SM4_BLOCK_SIZE; + dst += (nblks - 1) * SM4_BLOCK_SIZE; + memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); + + for (i = nblks - 1; i > 0; i--) { + crypto_xor_cpy(dst, src, + &keystream[i * SM4_BLOCK_SIZE], + SM4_BLOCK_SIZE); + src -= SM4_BLOCK_SIZE; + dst -= SM4_BLOCK_SIZE; + } + crypto_xor_cpy(dst, walk.iv, + keystream, SM4_BLOCK_SIZE); + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cfb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + sm4_crypt_block(ctx->rkey_enc, keystream, iv); + crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_cfb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_cfb_dec_blk8(ctx->rkey_enc, dst, src, + walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + if (nblks > 1) + memcpy(&keystream[SM4_BLOCK_SIZE], src, + (nblks - 1) * SM4_BLOCK_SIZE); + memcpy(walk.iv, src + (nblks - 1) * SM4_BLOCK_SIZE, + SM4_BLOCK_SIZE); + + sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, + keystream, nblks); + + crypto_xor_cpy(dst, src, keystream, + nblks * SM4_BLOCK_SIZE); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int sm4_ctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int nblks; + + kernel_neon_begin(); + + nblks = BYTES2BLK8(nbytes); + if (nblks) { + sm4_neon_ctr_enc_blk8(ctx->rkey_enc, dst, src, + walk.iv, nblks); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + nblks = BYTES2BLKS(nbytes); + if (nblks) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + int i; + + for (i = 0; i < nblks; i++) { + memcpy(&keystream[i * SM4_BLOCK_SIZE], + walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + } + sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, + keystream, nblks); + + crypto_xor_cpy(dst, src, keystream, + nblks * SM4_BLOCK_SIZE); + dst += nblks * SM4_BLOCK_SIZE; + src += nblks * SM4_BLOCK_SIZE; + nbytes -= nblks * SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static struct skcipher_alg sm4_algs[] = { + { + .base = { + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-neon", + .cra_priority = 200, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ecb_encrypt, + .decrypt = sm4_ecb_decrypt, + }, { + .base = { + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-neon", + .cra_priority = 200, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = sm4_cbc_decrypt, + }, { + .base = { + .cra_name = "cfb(sm4)", + .cra_driver_name = "cfb-sm4-neon", + .cra_priority = 200, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = sm4_cfb_decrypt, + }, { + .base = { + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-neon", + .cra_priority = 200, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .setkey = sm4_setkey, + .encrypt = sm4_ctr_crypt, + .decrypt = sm4_ctr_crypt, + } +}; + +static int __init sm4_init(void) +{ + return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); +} + +static void __exit sm4_exit(void) +{ + crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 NEON"); +MODULE_ALIAS_CRYPTO("sm4-neon"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("ecb(sm4)"); +MODULE_ALIAS_CRYPTO("cbc(sm4)"); +MODULE_ALIAS_CRYPTO("cfb(sm4)"); +MODULE_ALIAS_CRYPTO("ctr(sm4)"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_LICENSE("GPL v2"); |