Diffstat (limited to 'comm/third_party/libgcrypt/cipher/cast5-arm.S')
-rw-r--r-- | comm/third_party/libgcrypt/cipher/cast5-arm.S | 728
1 file changed, 728 insertions, 0 deletions
diff --git a/comm/third_party/libgcrypt/cipher/cast5-arm.S b/comm/third_party/libgcrypt/cipher/cast5-arm.S
new file mode 100644
index 0000000000..76ddd2e335
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cast5-arm.S
@@ -0,0 +1,728 @@
+/* cast5-arm.S - ARM assembly implementation of CAST5 cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+.extern _gcry_cast5_s1to4;
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+	ldr reg, 1f; \
+	ldr rtmp, 2f; \
+	b 3f; \
+1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+2:	.word name(GOT); \
+3:	add reg, pc, reg; \
+	ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+/* structure of crypto context */
+#define Km 0
+#define Kr (Km + (16 * 4))
+#define Kr_arm_enc (Kr + (16))
+#define Kr_arm_dec (Kr_arm_enc + (16))
+
+/* register macros */
+#define CTX %r0
+#define Rs1 %r7
+#define Rs2 %r8
+#define Rs3 %r9
+#define Rs4 %r10
+#define RMASK %r11
+#define RKM %r1
+#define RKR %r2
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %lr
+#define RT1 %ip
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 0)]; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 3)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 0)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 1)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 2)]; \
+	strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 3)]; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 0)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 3)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 2)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 1)]; \
+	strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+	#define ldr_unaligned_host ldr_unaligned_le
+	#define str_unaligned_host str_unaligned_le
+
+	/* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+	#define host_to_be(reg, rtmp) \
+		rev reg, reg;
+	#define be_to_host(reg, rtmp) \
+		rev reg, reg;
+#else
+	#define host_to_be(reg, rtmp) \
+		eor rtmp, reg, reg, ror #16; \
+		mov rtmp, rtmp, lsr #8; \
+		bic rtmp, rtmp, #65280; \
+		eor reg, rtmp, reg, ror #8;
+	#define be_to_host(reg, rtmp) \
+		eor rtmp, reg, reg, ror #16; \
+		mov rtmp, rtmp, lsr #8; \
+		bic rtmp, rtmp, #65280; \
+		eor reg, rtmp, reg, ror #8;
+#endif
+#else
+	#define ldr_unaligned_host ldr_unaligned_be
+	#define str_unaligned_host str_unaligned_be
+
+	/* nop on big-endian */
+	#define host_to_be(reg, rtmp) /*_*/
+	#define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+/**********************************************************************
+  1-way cast5
+ **********************************************************************/
+
+#define dummy(n) /*_*/
+
+#define load_kr(n) \
+	ldr RKR, [CTX, #(Kr_arm_enc + (n))]; /* Kr[n] */
+
+#define load_dec_kr(n) \
+	ldr RKR, [CTX, #(Kr_arm_dec + (n) - 3)]; /* Kr[n] */
+
+#define load_km(n) \
+	ldr RKM, [CTX, #(Km + (n) * 4)]; /* Km[n] */
+
+#define shift_kr(dummy) \
+	mov RKR, RKR, lsr #8;
+
+#define F(n, rl, rr, op1, op2, op3, op4, dec, loadkm, shiftkr, loadkr) \
+	op1 RKM, rr; \
+	mov RKM, RKM, ror RKR; \
+	\
+	and RT0, RMASK, RKM, ror #(24); \
+	and RT1, RMASK, RKM, lsr #(16); \
+	and RT2, RMASK, RKM, lsr #(8); \
+	ldr RT0, [Rs1, RT0]; \
+	and RT3, RMASK, RKM; \
+	ldr RT1, [Rs2, RT1]; \
+	shiftkr(RKR); \
+	\
+	ldr RT2, [Rs3, RT2]; \
+	\
+	op2 RT0, RT1; \
+	ldr RT3, [Rs4, RT3]; \
+	op3 RT0, RT2; \
+	loadkm((n) + (1 - ((dec) * 2))); \
+	op4 RT0, RT3; \
+	loadkr((n) + (1 - ((dec) * 2))); \
+	eor rl, RT0;
+
+#define F1(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+	F(n, rl, rr, add, eor, sub, add, dec, loadkm, shiftkr, loadkr)
+#define F2(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+	F(n, rl, rr, eor, sub, add, eor, dec, loadkm, shiftkr, loadkr)
+#define F3(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+	F(n, rl, rr, sub, add, eor, sub, dec, loadkm, shiftkr, loadkr)
+
+#define enc_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx(n, rl, rr, 0, loadkm, shiftkr, loadkr)
+
+#define dec_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx(n, rl, rr, 1, loadkm, shiftkr, loadkr)
+
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
+	ldr l0, [rin, #((offs) + 0)]; \
+	ldr r0, [rin, #((offs) + 4)]; \
+	convert(l0, rtmp); \
+	convert(r0, rtmp);
+
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
+	convert(l0, rtmp); \
+	convert(r0, rtmp); \
+	str l0, [rout, #((offs) + 0)]; \
+	str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads allowed */
+	#define read_block(rin, offs, l0, r0, rtmp0) \
+		read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
+
+	#define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+		write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
+
+	#define read_block_host(rin, offs, l0, r0, rtmp0) \
+		read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
+
+	#define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+		write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
+#else
+	/* need to handle unaligned reads by byte reads */
+	#define read_block(rin, offs, l0, r0, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+		ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+		ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+		b 2f; \
+	1:;\
+		read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
+	2:;
+
+	#define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+		str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+		str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+		b 2f; \
+	1:;\
+		write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
+	2:;
+
+	#define read_block_host(rin, offs, l0, r0, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+		ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+		ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+		b 2f; \
+	1:;\
+		read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
+	2:;
+
+	#define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+		str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+		str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+		b 2f; \
+	1:;\
+		write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
+	2:;
+#endif
+
+.align 3
+.globl _gcry_cast5_arm_encrypt_block
+.type _gcry_cast5_arm_encrypt_block,%function;
+
+_gcry_cast5_arm_encrypt_block:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100*4);
+	add Rs3, Rs1, #(0x100*4*2);
+	add Rs4, Rs1, #(0x100*4*3);
+
+	read_block(%r2, 0, RL0, RR0, RT0);
+
+	load_km(0);
+	load_kr(0);
+	enc_round(0, F1, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(1, F2, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(2, F3, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(3, F1, RR0, RL0, load_km, dummy, load_kr);
+	enc_round(4, F2, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(5, F3, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(6, F1, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(7, F2, RR0, RL0, load_km, dummy, load_kr);
+	enc_round(8, F3, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(9, F1, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(10, F2, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(11, F3, RR0, RL0, load_km, dummy, load_kr);
+	enc_round(12, F1, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(13, F2, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(15, F1, RR0, RL0, dummy, dummy, dummy);
+
+	ldr %r1, [%sp], #4;
+	write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block;
+
+.align 3
+.globl _gcry_cast5_arm_decrypt_block
+.type _gcry_cast5_arm_decrypt_block,%function;
+
+_gcry_cast5_arm_decrypt_block:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+	add Rs3, Rs1, #(0x100 * 4 * 2);
+	add Rs4, Rs1, #(0x100 * 4 * 3);
+
+	read_block(%r2, 0, RL0, RR0, RT0);
+
+	load_km(15);
+	load_dec_kr(15);
+	dec_round(15, F1, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(14, F3, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(13, F2, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(12, F1, RR0, RL0, load_km, dummy, load_dec_kr);
+	dec_round(11, F3, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(10, F2, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(9, F1, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(8, F3, RR0, RL0, load_km, dummy, load_dec_kr);
+	dec_round(7, F2, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(6, F1, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(5, F3, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(4, F2, RR0, RL0, load_km, dummy, load_dec_kr);
+	dec_round(3, F1, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(2, F3, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(0, F1, RR0, RL0, dummy, dummy, dummy);
+
+	ldr %r1, [%sp], #4;
+	write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block;
+
+/**********************************************************************
+  2-way cast5
+ **********************************************************************/
+
+#define F_2w(n, rl0, rr0, rl1, rr1, op1, op2, op3, op4, dec, loadkm, shiftkr, \
+	     loadkr) \
+	op1 RT3, RKM, rr0; \
+	op1 RKM, RKM, rr1; \
+	mov RT3, RT3, ror RKR; \
+	mov RKM, RKM, ror RKR; \
+	\
+	and RT0, RMASK, RT3, ror #(24); \
+	and RT1, RMASK, RT3, lsr #(16); \
+	and RT2, RMASK, RT3, lsr #(8); \
+	and RT3, RMASK, RT3; \
+	\
+	ldr RT0, [Rs1, RT0]; \
+	add RT2, #(0x100 * 4); \
+	ldr RT1, [Rs2, RT1]; \
+	add RT3, #(0x100 * 4 * 2); \
+	\
+	ldr RT2, [Rs2, RT2]; \
+	\
+	op2 RT0, RT1; \
+	ldr RT3, [Rs2, RT3]; \
+	and RT1, RMASK, RKM, ror #(24); \
+	op3 RT0, RT2; \
+	and RT2, RMASK, RKM, lsr #(16); \
+	op4 RT0, RT3; \
+	and RT3, RMASK, RKM, lsr #(8); \
+	eor rl0, RT0; \
+	add RT3, #(0x100 * 4); \
+	ldr RT1, [Rs1, RT1]; \
+	and RT0, RMASK, RKM; \
+	ldr RT2, [Rs2, RT2]; \
+	add RT0, #(0x100 * 4 * 2); \
+	\
+	ldr RT3, [Rs2, RT3]; \
+	\
+	op2 RT1, RT2; \
+	ldr RT0, [Rs2, RT0]; \
+	op3 RT1, RT3; \
+	loadkm((n) + (1 - ((dec) * 2))); \
+	op4 RT1, RT0; \
+	loadkr((n) + (1 - ((dec) * 2))); \
+	shiftkr(RKR); \
+	eor rl1, RT1;
+
+#define F1_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+	F_2w(n, rl0, rr0, rl1, rr1, add, eor, sub, add, dec, \
+	     loadkm, shiftkr, loadkr)
+#define F2_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+	F_2w(n, rl0, rr0, rl1, rr1, eor, sub, add, eor, dec, \
+	     loadkm, shiftkr, loadkr)
+#define F3_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+	F_2w(n, rl0, rr0, rl1, rr1, sub, add, eor, sub, dec, \
+	     loadkm, shiftkr, loadkr)
+
+#define enc_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 0, loadkm, shiftkr, loadkr)
+
+#define dec_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 1, loadkm, shiftkr, loadkr)
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
+	ldr l0, [rin, #(0)]; \
+	ldr r0, [rin, #(4)]; \
+	convert(l0, rtmp); \
+	ldr l1, [rin, #(8)]; \
+	convert(r0, rtmp); \
+	ldr r1, [rin, #(12)]; \
+	convert(l1, rtmp); \
+	convert(r1, rtmp);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
+	convert(l0, rtmp); \
+	convert(r0, rtmp); \
+	convert(l1, rtmp); \
+	str l0, [rout, #(0)]; \
+	convert(r1, rtmp); \
+	str r0, [rout, #(4)]; \
+	str l1, [rout, #(8)]; \
+	str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads allowed */
+	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
+
+	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
+
+	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
+
+	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
+#else
+	/* need to handle unaligned reads by byte reads */
+	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+		ldr_unaligned_be(l0, rin, 0, rtmp0); \
+		ldr_unaligned_be(r0, rin, 4, rtmp0); \
+		ldr_unaligned_be(l1, rin, 8, rtmp0); \
+		ldr_unaligned_be(r1, rin, 12, rtmp0); \
+		b 2f; \
+	1:;\
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
+	2:;
+
+	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+		str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+		str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+		str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+		str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+		b 2f; \
+	1:;\
+		write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
+	2:;
+
+	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+		ldr_unaligned_host(l0, rin, 0, rtmp0); \
+		ldr_unaligned_host(r0, rin, 4, rtmp0); \
+		ldr_unaligned_host(l1, rin, 8, rtmp0); \
+		ldr_unaligned_host(r1, rin, 12, rtmp0); \
+		b 2f; \
+	1:;\
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
+	2:;
+
+	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+		str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+		str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+		str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+		str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+		b 2f; \
+	1:;\
+		write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
+	2:;
+#endif
+
+.align 3
+.type _gcry_cast5_arm_enc_blk2,%function;
+
+_gcry_cast5_arm_enc_blk2:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0], [RL1, RR1]: src
+	 * output:
+	 *	[RR0, RL0], [RR1, RL1]: dst
+	 */
+	push {%lr};
+
+	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+
+	load_km(0);
+	load_kr(0);
+	enc_round2(0, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(1, F2, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(2, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(3, F1, RR, RL, load_km, dummy, load_kr);
+	enc_round2(4, F2, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(5, F3, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(6, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(7, F2, RR, RL, load_km, dummy, load_kr);
+	enc_round2(8, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(9, F1, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(10, F2, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(11, F3, RR, RL, load_km, dummy, load_kr);
+	enc_round2(12, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(13, F2, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(14, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(15, F1, RR, RL, dummy, dummy, dummy);
+
+	host_to_be(RR0, RT0);
+	host_to_be(RL0, RT0);
+	host_to_be(RR1, RT0);
+	host_to_be(RL1, RT0);
+
+	pop {%pc};
+.ltorg
+.size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2;
+
+.align 3
+.globl _gcry_cast5_arm_cfb_dec;
+.type _gcry_cast5_arm_cfb_dec,%function;
+
+_gcry_cast5_arm_cfb_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+	ldm %r3, {RL0, RR0};
+	host_to_be(RL0, RT1);
+	host_to_be(RR0, RT1);
+	read_block(%r2, 0, RL1, RR1, %ip);
+
+	/* Update IV, load src[1] and save to iv[0] */
+	read_block_host(%r2, 8, %r5, %r6, %r7);
+	stm %lr, {%r5, %r6};
+
+	bl _gcry_cast5_arm_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: %src */
+	pop {%r0, %r1};
+
+	/* dst = src ^ result */
+	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec;
+
+.align 3
+.globl _gcry_cast5_arm_ctr_enc;
+.type _gcry_cast5_arm_ctr_enc,%function;
+
+_gcry_cast5_arm_ctr_enc:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit, big-endian)
+	 */
+	push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load IV (big => host endian) */
+	read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1);
+
+	/* Construct IVs */
+	adds RR1, RR0, #1; /* +1 */
+	adc RL1, RL0, #0;
+	adds %r6, RR1, #1; /* +2 */
+	adc %r5, RL1, #0;
+
+	/* Store new IV (host => big-endian) */
+	write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT1);
+
+	bl _gcry_cast5_arm_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: %src */
+	pop {%r0, %r1};
+
+	/* XOR key-stream with plaintext */
+	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc;
+
+.align 3
+.type _gcry_cast5_arm_dec_blk2,%function;
+
+_gcry_cast5_arm_dec_blk2:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0], [RL1, RR1]: src
+	 * output:
+	 *	[RR0, RL0], [RR1, RL1]: dst
+	 */
+
+	GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+
+	load_km(15);
+	load_dec_kr(15);
+	dec_round2(15, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(14, F3, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(13, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(12, F1, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(11, F3, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(10, F2, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(9, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(8, F3, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(7, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(6, F1, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(5, F3, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(4, F2, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(3, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(2, F3, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(1, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(0, F1, RR, RL, dummy, dummy, dummy);
+
+	host_to_be(RR0, RT0);
+	host_to_be(RL0, RT0);
+	host_to_be(RR1, RT0);
+	host_to_be(RL1, RT0);
+
+	b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_cast5_arm_dec_blk2,.-_gcry_cast5_arm_dec_blk2;
+
+.align 3
+.globl _gcry_cast5_arm_cbc_dec;
+.type _gcry_cast5_arm_cbc_dec,%function;
+
+_gcry_cast5_arm_cbc_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r1-%r11, %ip, %lr};
+
+	read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+	/* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+	 * of function call. */
+	b _gcry_cast5_arm_dec_blk2;
+.Ldec_cbc_tail:
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: %src, %r2: iv */
+	pop {%r0-%r2};
+
+	/* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r1, 0, %r7, %r8, %r5);
+	/* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+	ldm %r2, {%r5, %r6};
+
+	/* out[1] ^= IV+1 */
+	eor %r10, %r7;
+	eor %r9, %r8;
+	/* out[0] ^= IV */
+	eor %r4, %r5;
+	eor %r3, %r6;
+
+	/* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r1, 8, %r7, %r8, %r5);
+	/* store IV+2 to iv[0] (aligned). */
+	stm %r2, {%r7, %r8};
+
+	/* store result to dst[0-3]. Might be unaligned. */
+	write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
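
The five .globl symbols above are intended to be called from libgcrypt's C code. As a rough orientation only, the following sketch shows C prototypes consistent with the register comments in this file (%r0: CTX, %r1: dst, %r2: src, %r3: iv) under the AAPCS calling convention. The identifiers byte and CAST5_context are placeholders standing in for libgcrypt's own types; the authoritative declarations live on the C side of the cipher (cipher/cast5.c) and may differ in detail.

    /* Sketch only: plausible C-side declarations for the entry points above.
     * 'byte' and 'CAST5_context' are placeholders; the real context layout
     * must provide the Km/Kr/Kr_arm_enc/Kr_arm_dec fields at the offsets
     * defined near the top of this file. */
    typedef unsigned char byte;
    typedef struct CAST5_context CAST5_context;

    /* single-block operations (8-byte blocks) */
    extern void _gcry_cast5_arm_encrypt_block(CAST5_context *ctx, byte *dst,
                                              const byte *src);
    extern void _gcry_cast5_arm_decrypt_block(CAST5_context *ctx, byte *dst,
                                              const byte *src);

    /* two-block bulk helpers: dst/src cover 2 blocks (16 bytes),
     * iv/ctr is one 8-byte block updated in place */
    extern void _gcry_cast5_arm_cfb_dec(CAST5_context *ctx, byte *dst,
                                        const byte *src, byte *iv);
    extern void _gcry_cast5_arm_ctr_enc(CAST5_context *ctx, byte *dst,
                                        const byte *src, byte *ctr);
    extern void _gcry_cast5_arm_cbc_dec(CAST5_context *ctx, byte *dst,
                                        const byte *src, byte *iv);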