Diffstat (limited to 'comm/third_party/botan/src/lib/block/aes')
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes.cpp                     1017
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes.h                        131
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_armv8/aes_armv8.cpp      484
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_armv8/info.txt            12
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_ni/aes_ni.cpp            780
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_ni/info.txt                9
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_power8/aes_power8.cpp    529
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_power8/info.txt           11
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp      627
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/aes_vperm/info.txt            36
-rw-r--r--  comm/third_party/botan/src/lib/block/aes/info.txt                       3
11 files changed, 3639 insertions, 0 deletions
diff --git a/comm/third_party/botan/src/lib/block/aes/aes.cpp b/comm/third_party/botan/src/lib/block/aes/aes.cpp
new file mode 100644
index 0000000000..88d6e9027f
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes.cpp
@@ -0,0 +1,1017 @@
+/*
+* (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/aes.h>
+#include <botan/loadstor.h>
+#include <botan/cpuid.h>
+#include <botan/rotate.h>
+#include <botan/internal/bit_ops.h>
+#include <botan/internal/ct_utils.h>
+
+namespace Botan {
+
+#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
+ #define BOTAN_HAS_HW_AES_SUPPORT
+#endif
+
+/*
+* One of three AES implementation strategies is used to get a constant time
+* implementation which is immune to common cache/timing based side channels:
+*
+* - If AES hardware support is available (AES-NI, POWER8, Aarch64) use that
+*
+* - If 128-bit SIMD with byte shuffles is available (SSSE3, NEON, or Altivec),
+* use the vperm technique published by Mike Hamburg at CHES 2009.
+*
+* - If no hardware or SIMD support, fall back to a constant time bitsliced
+* implementation. This uses 32-bit words resulting in 2 blocks being processed
+* in parallel. Moving to 4 blocks (with 64-bit words) would approximately
+* double performance on 64-bit CPUs. Likewise moving to 128 bit SIMD would
+* again approximately double performance vs 64-bit. However the assumption is
+* that most 64-bit CPUs either have hardware AES or SIMD shuffle support and
+* that the majority of users falling back to this code will be 32-bit cores.
+* If this assumption proves to be unsound, the bitsliced code can easily be
+* extended to operate on either 32 or 64 bit words depending on the native
+* wordsize of the target processor.
+*
+* Useful references
+*
+* - "Accelerating AES with Vector Permute Instructions" Mike Hamburg
+* https://www.shiftleft.org/papers/vector_aes/vector_aes.pdf
+*
+* - "Faster and Timing-Attack Resistant AES-GCM" Käsper and Schwabe
+* https://eprint.iacr.org/2009/129.pdf
+*
+* - "A new combinational logic minimization technique with applications to cryptology."
+* Boyar and Peralta https://eprint.iacr.org/2009/191.pdf
+*
+* - "A depth-16 circuit for the AES S-box" Boyar and Peralta
+* https://eprint.iacr.org/2011/332.pdf
+*
+* - "A Very Compact S-box for AES" Canright
+* https://www.iacr.org/archive/ches2005/032.pdf
+* https://core.ac.uk/download/pdf/36694529.pdf (extended)
+*/
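To make the width/throughput tradeoff described above concrete: the bitsliced state used below is 8 x uint32_t = 256 bits, i.e. two 128-bit AES blocks per pass; with 8 x uint64_t it would be 512 bits (four blocks), and with 8 x 128-bit SIMD registers 1024 bits (eight blocks).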
+
+namespace {
+
+/*
+This is an AES sbox circuit which can execute in bitsliced mode up to 32x in
+parallel.
+
+The circuit is from the "Circuit Minimization Team" group
+http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
+http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt
+
+This circuit has size 113 and depth 27. In software it is much faster than
+circuits which are considered faster for hardware purposes (where circuit depth
+is the critical constraint), because unlike in hardware, on common CPUs we can
+only execute - at best - 3 or 4 logic operations per cycle. So a smaller circuit
+is superior. On an x86-64 machine this circuit is about 15% faster than the
+circuit of size 128 and depth 16 given in "A depth-16 circuit for the AES S-box".
+
+Another circuit for the AES Sbox of size 102 and depth 24 is described in "New
+Circuit Minimization Techniques for Smaller and Faster AES SBoxes"
+[https://eprint.iacr.org/2019/802] however it relies on "non-standard" gates
+like MUX, NOR, NAND, etc., and so in practice, in bitsliced software, its size is
+actually a bit larger than this circuit's, as few CPUs have such instructions and
+otherwise they must be emulated using a sequence of available bit operations.
+*/
+void AES_SBOX(uint32_t V[8])
+ {
+ const uint32_t U0 = V[0];
+ const uint32_t U1 = V[1];
+ const uint32_t U2 = V[2];
+ const uint32_t U3 = V[3];
+ const uint32_t U4 = V[4];
+ const uint32_t U5 = V[5];
+ const uint32_t U6 = V[6];
+ const uint32_t U7 = V[7];
+
+ const uint32_t y14 = U3 ^ U5;
+ const uint32_t y13 = U0 ^ U6;
+ const uint32_t y9 = U0 ^ U3;
+ const uint32_t y8 = U0 ^ U5;
+ const uint32_t t0 = U1 ^ U2;
+ const uint32_t y1 = t0 ^ U7;
+ const uint32_t y4 = y1 ^ U3;
+ const uint32_t y12 = y13 ^ y14;
+ const uint32_t y2 = y1 ^ U0;
+ const uint32_t y5 = y1 ^ U6;
+ const uint32_t y3 = y5 ^ y8;
+ const uint32_t t1 = U4 ^ y12;
+ const uint32_t y15 = t1 ^ U5;
+ const uint32_t y20 = t1 ^ U1;
+ const uint32_t y6 = y15 ^ U7;
+ const uint32_t y10 = y15 ^ t0;
+ const uint32_t y11 = y20 ^ y9;
+ const uint32_t y7 = U7 ^ y11;
+ const uint32_t y17 = y10 ^ y11;
+ const uint32_t y19 = y10 ^ y8;
+ const uint32_t y16 = t0 ^ y11;
+ const uint32_t y21 = y13 ^ y16;
+ const uint32_t y18 = U0 ^ y16;
+ const uint32_t t2 = y12 & y15;
+ const uint32_t t3 = y3 & y6;
+ const uint32_t t4 = t3 ^ t2;
+ const uint32_t t5 = y4 & U7;
+ const uint32_t t6 = t5 ^ t2;
+ const uint32_t t7 = y13 & y16;
+ const uint32_t t8 = y5 & y1;
+ const uint32_t t9 = t8 ^ t7;
+ const uint32_t t10 = y2 & y7;
+ const uint32_t t11 = t10 ^ t7;
+ const uint32_t t12 = y9 & y11;
+ const uint32_t t13 = y14 & y17;
+ const uint32_t t14 = t13 ^ t12;
+ const uint32_t t15 = y8 & y10;
+ const uint32_t t16 = t15 ^ t12;
+ const uint32_t t17 = t4 ^ y20;
+ const uint32_t t18 = t6 ^ t16;
+ const uint32_t t19 = t9 ^ t14;
+ const uint32_t t20 = t11 ^ t16;
+ const uint32_t t21 = t17 ^ t14;
+ const uint32_t t22 = t18 ^ y19;
+ const uint32_t t23 = t19 ^ y21;
+ const uint32_t t24 = t20 ^ y18;
+ const uint32_t t25 = t21 ^ t22;
+ const uint32_t t26 = t21 & t23;
+ const uint32_t t27 = t24 ^ t26;
+ const uint32_t t28 = t25 & t27;
+ const uint32_t t29 = t28 ^ t22;
+ const uint32_t t30 = t23 ^ t24;
+ const uint32_t t31 = t22 ^ t26;
+ const uint32_t t32 = t31 & t30;
+ const uint32_t t33 = t32 ^ t24;
+ const uint32_t t34 = t23 ^ t33;
+ const uint32_t t35 = t27 ^ t33;
+ const uint32_t t36 = t24 & t35;
+ const uint32_t t37 = t36 ^ t34;
+ const uint32_t t38 = t27 ^ t36;
+ const uint32_t t39 = t29 & t38;
+ const uint32_t t40 = t25 ^ t39;
+ const uint32_t t41 = t40 ^ t37;
+ const uint32_t t42 = t29 ^ t33;
+ const uint32_t t43 = t29 ^ t40;
+ const uint32_t t44 = t33 ^ t37;
+ const uint32_t t45 = t42 ^ t41;
+ const uint32_t z0 = t44 & y15;
+ const uint32_t z1 = t37 & y6;
+ const uint32_t z2 = t33 & U7;
+ const uint32_t z3 = t43 & y16;
+ const uint32_t z4 = t40 & y1;
+ const uint32_t z5 = t29 & y7;
+ const uint32_t z6 = t42 & y11;
+ const uint32_t z7 = t45 & y17;
+ const uint32_t z8 = t41 & y10;
+ const uint32_t z9 = t44 & y12;
+ const uint32_t z10 = t37 & y3;
+ const uint32_t z11 = t33 & y4;
+ const uint32_t z12 = t43 & y13;
+ const uint32_t z13 = t40 & y5;
+ const uint32_t z14 = t29 & y2;
+ const uint32_t z15 = t42 & y9;
+ const uint32_t z16 = t45 & y14;
+ const uint32_t z17 = t41 & y8;
+ const uint32_t tc1 = z15 ^ z16;
+ const uint32_t tc2 = z10 ^ tc1;
+ const uint32_t tc3 = z9 ^ tc2;
+ const uint32_t tc4 = z0 ^ z2;
+ const uint32_t tc5 = z1 ^ z0;
+ const uint32_t tc6 = z3 ^ z4;
+ const uint32_t tc7 = z12 ^ tc4;
+ const uint32_t tc8 = z7 ^ tc6;
+ const uint32_t tc9 = z8 ^ tc7;
+ const uint32_t tc10 = tc8 ^ tc9;
+ const uint32_t tc11 = tc6 ^ tc5;
+ const uint32_t tc12 = z3 ^ z5;
+ const uint32_t tc13 = z13 ^ tc1;
+ const uint32_t tc14 = tc4 ^ tc12;
+ const uint32_t S3 = tc3 ^ tc11;
+ const uint32_t tc16 = z6 ^ tc8;
+ const uint32_t tc17 = z14 ^ tc10;
+ const uint32_t tc18 = ~tc13 ^ tc14;
+ const uint32_t S7 = z12 ^ tc18;
+ const uint32_t tc20 = z15 ^ tc16;
+ const uint32_t tc21 = tc2 ^ z11;
+ const uint32_t S0 = tc3 ^ tc16;
+ const uint32_t S6 = tc10 ^ tc18;
+ const uint32_t S4 = tc14 ^ S3;
+ const uint32_t S1 = ~(S3 ^ tc16);
+ const uint32_t tc26 = tc17 ^ tc20;
+ const uint32_t S2 = ~(tc26 ^ z17);
+ const uint32_t S5 = tc21 ^ tc17;
+
+ V[0] = S0;
+ V[1] = S1;
+ V[2] = S2;
+ V[3] = S3;
+ V[4] = S4;
+ V[5] = S5;
+ V[6] = S6;
+ V[7] = S7;
+ }
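As a sketch of the bit-plane convention this circuit assumes (a hypothetical helper, not part of the patch; the in-tree SE_word() further below does the same for four bytes at once): bit (7-i) of the input byte goes into the low bit of word V[i], the circuit runs, and the output bits are gathered back the same way.

   uint8_t sbox_one_byte(uint8_t x)        // illustration only
      {
      uint32_t I[8] = { 0 };
      for(size_t i = 0; i != 8; ++i)
         I[i] = (x >> (7 - i)) & 1;        // bit (7-i) of x -> low bit of I[i]
      AES_SBOX(I);                         // the circuit above, on one slice
      uint8_t r = 0;
      for(size_t i = 0; i != 8; ++i)
         r |= static_cast<uint8_t>((I[i] & 1) << (7 - i));
      return r;                            // sbox_one_byte(0x00) == 0x63
      }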
+
+/*
+A circuit for inverse AES Sbox of size 121 and depth 21 from
+http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
+http://www.cs.yale.edu/homes/peralta/CircuitStuff/Sinv.txt
+*/
+void AES_INV_SBOX(uint32_t V[8])
+ {
+ const uint32_t U0 = V[0];
+ const uint32_t U1 = V[1];
+ const uint32_t U2 = V[2];
+ const uint32_t U3 = V[3];
+ const uint32_t U4 = V[4];
+ const uint32_t U5 = V[5];
+ const uint32_t U6 = V[6];
+ const uint32_t U7 = V[7];
+
+ const uint32_t Y0 = U0 ^ U3;
+ const uint32_t Y2 = ~(U1 ^ U3);
+ const uint32_t Y4 = U0 ^ Y2;
+ const uint32_t RTL0 = U6 ^ U7;
+ const uint32_t Y1 = Y2 ^ RTL0;
+ const uint32_t Y7 = ~(U2 ^ Y1);
+ const uint32_t RTL1 = U3 ^ U4;
+ const uint32_t Y6 = ~(U7 ^ RTL1);
+ const uint32_t Y3 = Y1 ^ RTL1;
+ const uint32_t RTL2 = ~(U0 ^ U2);
+ const uint32_t Y5 = U5 ^ RTL2;
+ const uint32_t sa1 = Y0 ^ Y2;
+ const uint32_t sa0 = Y1 ^ Y3;
+ const uint32_t sb1 = Y4 ^ Y6;
+ const uint32_t sb0 = Y5 ^ Y7;
+ const uint32_t ah = Y0 ^ Y1;
+ const uint32_t al = Y2 ^ Y3;
+ const uint32_t aa = sa0 ^ sa1;
+ const uint32_t bh = Y4 ^ Y5;
+ const uint32_t bl = Y6 ^ Y7;
+ const uint32_t bb = sb0 ^ sb1;
+ const uint32_t ab20 = sa0 ^ sb0;
+ const uint32_t ab22 = al ^ bl;
+ const uint32_t ab23 = Y3 ^ Y7;
+ const uint32_t ab21 = sa1 ^ sb1;
+ const uint32_t abcd1 = ah & bh;
+ const uint32_t rr1 = Y0 & Y4;
+ const uint32_t ph11 = ab20 ^ abcd1;
+ const uint32_t t01 = Y1 & Y5;
+ const uint32_t ph01 = t01 ^ abcd1;
+ const uint32_t abcd2 = al & bl;
+ const uint32_t r1 = Y2 & Y6;
+ const uint32_t pl11 = ab22 ^ abcd2;
+ const uint32_t r2 = Y3 & Y7;
+ const uint32_t pl01 = r2 ^ abcd2;
+ const uint32_t r3 = sa0 & sb0;
+ const uint32_t vr1 = aa & bb;
+ const uint32_t pr1 = vr1 ^ r3;
+ const uint32_t wr1 = sa1 & sb1;
+ const uint32_t qr1 = wr1 ^ r3;
+ const uint32_t ab0 = ph11 ^ rr1;
+ const uint32_t ab1 = ph01 ^ ab21;
+ const uint32_t ab2 = pl11 ^ r1;
+ const uint32_t ab3 = pl01 ^ qr1;
+ const uint32_t cp1 = ab0 ^ pr1;
+ const uint32_t cp2 = ab1 ^ qr1;
+ const uint32_t cp3 = ab2 ^ pr1;
+ const uint32_t cp4 = ab3 ^ ab23;
+ const uint32_t tinv1 = cp3 ^ cp4;
+ const uint32_t tinv2 = cp3 & cp1;
+ const uint32_t tinv3 = cp2 ^ tinv2;
+ const uint32_t tinv4 = cp1 ^ cp2;
+ const uint32_t tinv5 = cp4 ^ tinv2;
+ const uint32_t tinv6 = tinv5 & tinv4;
+ const uint32_t tinv7 = tinv3 & tinv1;
+ const uint32_t d2 = cp4 ^ tinv7;
+ const uint32_t d0 = cp2 ^ tinv6;
+ const uint32_t tinv8 = cp1 & cp4;
+ const uint32_t tinv9 = tinv4 & tinv8;
+ const uint32_t tinv10 = tinv4 ^ tinv2;
+ const uint32_t d1 = tinv9 ^ tinv10;
+ const uint32_t tinv11 = cp2 & cp3;
+ const uint32_t tinv12 = tinv1 & tinv11;
+ const uint32_t tinv13 = tinv1 ^ tinv2;
+ const uint32_t d3 = tinv12 ^ tinv13;
+ const uint32_t sd1 = d1 ^ d3;
+ const uint32_t sd0 = d0 ^ d2;
+ const uint32_t dl = d0 ^ d1;
+ const uint32_t dh = d2 ^ d3;
+ const uint32_t dd = sd0 ^ sd1;
+ const uint32_t abcd3 = dh & bh;
+ const uint32_t rr2 = d3 & Y4;
+ const uint32_t t02 = d2 & Y5;
+ const uint32_t abcd4 = dl & bl;
+ const uint32_t r4 = d1 & Y6;
+ const uint32_t r5 = d0 & Y7;
+ const uint32_t r6 = sd0 & sb0;
+ const uint32_t vr2 = dd & bb;
+ const uint32_t wr2 = sd1 & sb1;
+ const uint32_t abcd5 = dh & ah;
+ const uint32_t r7 = d3 & Y0;
+ const uint32_t r8 = d2 & Y1;
+ const uint32_t abcd6 = dl & al;
+ const uint32_t r9 = d1 & Y2;
+ const uint32_t r10 = d0 & Y3;
+ const uint32_t r11 = sd0 & sa0;
+ const uint32_t vr3 = dd & aa;
+ const uint32_t wr3 = sd1 & sa1;
+ const uint32_t ph12 = rr2 ^ abcd3;
+ const uint32_t ph02 = t02 ^ abcd3;
+ const uint32_t pl12 = r4 ^ abcd4;
+ const uint32_t pl02 = r5 ^ abcd4;
+ const uint32_t pr2 = vr2 ^ r6;
+ const uint32_t qr2 = wr2 ^ r6;
+ const uint32_t p0 = ph12 ^ pr2;
+ const uint32_t p1 = ph02 ^ qr2;
+ const uint32_t p2 = pl12 ^ pr2;
+ const uint32_t p3 = pl02 ^ qr2;
+ const uint32_t ph13 = r7 ^ abcd5;
+ const uint32_t ph03 = r8 ^ abcd5;
+ const uint32_t pl13 = r9 ^ abcd6;
+ const uint32_t pl03 = r10 ^ abcd6;
+ const uint32_t pr3 = vr3 ^ r11;
+ const uint32_t qr3 = wr3 ^ r11;
+ const uint32_t p4 = ph13 ^ pr3;
+ const uint32_t S7 = ph03 ^ qr3;
+ const uint32_t p6 = pl13 ^ pr3;
+ const uint32_t p7 = pl03 ^ qr3;
+ const uint32_t S3 = p1 ^ p6;
+ const uint32_t S6 = p2 ^ p6;
+ const uint32_t S0 = p3 ^ p6;
+ const uint32_t X11 = p0 ^ p2;
+ const uint32_t S5 = S0 ^ X11;
+ const uint32_t X13 = p4 ^ p7;
+ const uint32_t X14 = X11 ^ X13;
+ const uint32_t S1 = S3 ^ X14;
+ const uint32_t X16 = p1 ^ S7;
+ const uint32_t S2 = X14 ^ X16;
+ const uint32_t X18 = p0 ^ p4;
+ const uint32_t X19 = S5 ^ X16;
+ const uint32_t S4 = X18 ^ X19;
+
+ V[0] = S0;
+ V[1] = S1;
+ V[2] = S2;
+ V[3] = S3;
+ V[4] = S4;
+ V[5] = S5;
+ V[6] = S6;
+ V[7] = S7;
+ }
+
+inline void bit_transpose(uint32_t B[8])
+ {
+ swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
+ swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
+ swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1);
+ swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1);
+
+ swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
+ swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
+ swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2);
+ swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2);
+
+ swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
+ }
+
+inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r)
+ {
+ /*
+ This is the bit_transpose of K[r..r+4] || K[r..r+4]; we can save some computation
+ because the first and second halves are known to be the same data.
+ */
+ for(size_t i = 0; i != 4; ++i)
+ B[i] = K[r + i];
+
+ swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
+ swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
+
+ swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
+ swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
+
+ B[4] = B[0];
+ B[5] = B[1];
+ B[6] = B[2];
+ B[7] = B[3];
+
+ swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
+ swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
+ }
+
+inline void shift_rows(uint32_t B[8])
+ {
+ // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31
+#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
+ for(size_t i = 0; i != 8; i += 2)
+ {
+ uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i+1];
+ x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
+ x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
+ B[i] = static_cast<uint32_t>(x >> 32);
+ B[i+1] = static_cast<uint32_t>(x);
+ }
+#else
+ for(size_t i = 0; i != 8; ++i)
+ {
+ uint32_t x = B[i];
+ x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
+ x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
+ B[i] = x;
+ }
+#endif
+ }
+
+inline void inv_shift_rows(uint32_t B[8])
+ {
+ // Inverse of shift_rows, just inverting the steps
+
+#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
+ for(size_t i = 0; i != 8; i += 2)
+ {
+ uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i+1];
+ x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
+ x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
+ B[i] = static_cast<uint32_t>(x >> 32);
+ B[i+1] = static_cast<uint32_t>(x);
+ }
+#else
+ for(size_t i = 0; i != 8; ++i)
+ {
+ uint32_t x = B[i];
+ x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
+ x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
+ B[i] = x;
+ }
+#endif
+ }
+
+inline void mix_columns(uint32_t B[8])
+ {
+ // carry high bits in B[0] to positions in 0x1b == 0b11011
+ const uint32_t X2[8] = {
+ B[1],
+ B[2],
+ B[3],
+ B[4] ^ B[0],
+ B[5] ^ B[0],
+ B[6],
+ B[7] ^ B[0],
+ B[0],
+ };
+
+ for(size_t i = 0; i != 8; i++)
+ {
+ const uint32_t X3 = B[i] ^ X2[i];
+ B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3);
+ }
+ }
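The "carry high bits in B[0]" comment above is the bitsliced form of doubling in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1 (0x11b): B[0] holds the high-bit plane, and its bits are folded into the planes selected by 0x1b. A minimal scalar sketch of that doubling, not part of the patch (xtime32 further below does the same for four bytes packed in a word):

   inline uint8_t xtime(uint8_t b)
      {
      return static_cast<uint8_t>((b << 1) ^ ((b >> 7) * 0x1B));
      }
   // e.g. xtime(0x57) == 0xAE (no reduction) and xtime(0x80) == 0x1B (reduced)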
+
+void inv_mix_columns(uint32_t B[8])
+ {
+ /*
+ OpenSSL's bsaes implementation credits Jussi Kivilinna with the lovely
+ matrix decomposition
+
+ | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+ | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+ | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+ | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+ Notice the first component is simply the MixColumns matrix. So we can
+ multiply first by (05,00,04,00) then perform MixColumns to get the equivalent
+ of InvMixColumn.
+ */
+ const uint32_t X4[8] = {
+ B[2],
+ B[3],
+ B[4] ^ B[0],
+ B[5] ^ B[0] ^ B[1],
+ B[6] ^ B[1],
+ B[7] ^ B[0],
+ B[0] ^ B[1],
+ B[1],
+ };
+
+ for(size_t i = 0; i != 8; i++)
+ {
+ const uint32_t X5 = X4[i] ^ B[i];
+ B[i] = X5 ^ rotr<16>(X4[i]);
+ }
+
+ mix_columns(B);
+ }
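A quick check of the decomposition above, multiplying out the first row of the product in GF(2^8): 02*05 ^ 01*04 = 0a ^ 04 = 0e; 03*05 ^ 01*04 = 0f ^ 04 = 0b; 02*04 ^ 01*05 = 08 ^ 05 = 0d; 03*04 ^ 01*05 = 0c ^ 05 = 09, reproducing the first row (0e 0b 0d 09) of the InvMixColumns matrix.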
+
+/*
+* AES Encryption
+*/
+void aes_encrypt_n(const uint8_t in[], uint8_t out[],
+ size_t blocks,
+ const secure_vector<uint32_t>& EK)
+ {
+ BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set");
+
+ const size_t rounds = (EK.size() - 4) / 4;
+
+ uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8
+ for(size_t i = 0; i < rounds - 1; i += 1)
+ {
+ ks_expand(&KS[8*i], EK.data(), 4*i + 4);
+ }
+
+ const size_t BLOCK_SIZE = 16;
+ const size_t BITSLICED_BLOCKS = 8*sizeof(uint32_t) / BLOCK_SIZE;
+
+ while(blocks > 0)
+ {
+ const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
+
+ uint32_t B[8] = { 0 };
+
+ load_be(B, in, this_loop*4);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= EK[i % 4];
+
+ bit_transpose(B);
+
+ for(size_t r = 0; r != rounds - 1; ++r)
+ {
+ AES_SBOX(B);
+ shift_rows(B);
+ mix_columns(B);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= KS[8*r + i];
+ }
+
+ // Final round:
+ AES_SBOX(B);
+ shift_rows(B);
+ bit_transpose(B);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= EK[4*rounds + i % 4];
+
+ copy_out_be(out, this_loop*4*sizeof(uint32_t), B);
+
+ in += this_loop * BLOCK_SIZE;
+ out += this_loop * BLOCK_SIZE;
+ blocks -= this_loop;
+ }
+ }
+
+/*
+* AES Decryption
+*/
+void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks,
+ const secure_vector<uint32_t>& DK)
+ {
+ BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set");
+
+ const size_t rounds = (DK.size() - 4) / 4;
+
+ uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8
+ for(size_t i = 0; i < rounds - 1; i += 1)
+ {
+ ks_expand(&KS[8*i], DK.data(), 4*i + 4);
+ }
+
+ const size_t BLOCK_SIZE = 16;
+ const size_t BITSLICED_BLOCKS = 8*sizeof(uint32_t) / BLOCK_SIZE;
+
+ while(blocks > 0)
+ {
+ const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
+
+ uint32_t B[8] = { 0 };
+
+ load_be(B, in, this_loop*4);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= DK[i % 4];
+
+ bit_transpose(B);
+
+ for(size_t r = 0; r != rounds - 1; ++r)
+ {
+ AES_INV_SBOX(B);
+ inv_shift_rows(B);
+ inv_mix_columns(B);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= KS[8*r + i];
+ }
+
+ // Final round:
+ AES_INV_SBOX(B);
+ inv_shift_rows(B);
+ bit_transpose(B);
+
+ for(size_t i = 0; i != 8; ++i)
+ B[i] ^= DK[4*rounds + i % 4];
+
+ copy_out_be(out, this_loop*4*sizeof(uint32_t), B);
+
+ in += this_loop * BLOCK_SIZE;
+ out += this_loop * BLOCK_SIZE;
+ blocks -= this_loop;
+ }
+ }
+
+inline uint32_t xtime32(uint32_t s)
+ {
+ const uint32_t lo_bit = 0x01010101;
+ const uint32_t mask = 0x7F7F7F7F;
+ const uint32_t poly = 0x1B;
+
+ return ((s & mask) << 1) ^ (((s >> 7) & lo_bit) * poly);
+ }
+
+inline uint32_t InvMixColumn(uint32_t s1)
+ {
+ const uint32_t s2 = xtime32(s1);
+ const uint32_t s4 = xtime32(s2);
+ const uint32_t s8 = xtime32(s4);
+ const uint32_t s9 = s8 ^ s1;
+ const uint32_t s11 = s9 ^ s2;
+ const uint32_t s13 = s9 ^ s4;
+ const uint32_t s14 = s8 ^ s4 ^ s2;
+
+ return s14 ^ rotr<8>(s9) ^ rotr<16>(s13) ^ rotr<24>(s11);
+ }
+
+void InvMixColumn_x4(uint32_t x[4])
+ {
+ x[0] = InvMixColumn(x[0]);
+ x[1] = InvMixColumn(x[1]);
+ x[2] = InvMixColumn(x[2]);
+ x[3] = InvMixColumn(x[3]);
+ }
+
+uint32_t SE_word(uint32_t x)
+ {
+ uint32_t I[8] = { 0 };
+
+ for(size_t i = 0; i != 8; ++i)
+ I[i] = (x >> (7-i)) & 0x01010101;
+
+ AES_SBOX(I);
+
+ x = 0;
+
+ for(size_t i = 0; i != 8; ++i)
+ x |= ((I[i] & 0x01010101) << (7-i));
+
+ return x;
+ }
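A quick sanity check of SE_word: since the AES S-box maps 0x00 to 0x63 and all four byte lanes are handled identically, SE_word(0x00000000) == 0x63636363.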
+
+void aes_key_schedule(const uint8_t key[], size_t length,
+ secure_vector<uint32_t>& EK,
+ secure_vector<uint32_t>& DK,
+ bool bswap_keys = false)
+ {
+ static const uint32_t RC[10] = {
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
+ 0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000 };
+
+ const size_t X = length / 4;
+
+ // Can't happen, but make static analyzers happy
+ BOTAN_ASSERT_NOMSG(X == 4 || X == 6 || X == 8);
+
+ const size_t rounds = (length / 4) + 6;
+
+ // Help the optimizer
+ BOTAN_ASSERT_NOMSG(rounds == 10 || rounds == 12 || rounds == 14);
+
+ CT::poison(key, length);
+
+ EK.resize(length + 28);
+ DK.resize(length + 28);
+
+ for(size_t i = 0; i != X; ++i)
+ EK[i] = load_be<uint32_t>(key, i);
+
+ for(size_t i = X; i < 4*(rounds+1); i += X)
+ {
+ EK[i] = EK[i-X] ^ RC[(i-X)/X] ^ rotl<8>(SE_word(EK[i-1]));
+
+ for(size_t j = 1; j != X && (i+j) < EK.size(); ++j)
+ {
+ EK[i+j] = EK[i+j-X];
+
+ if(X == 8 && j == 4)
+ EK[i+j] ^= SE_word(EK[i+j-1]);
+ else
+ EK[i+j] ^= EK[i+j-1];
+ }
+ }
+
+ for(size_t i = 0; i != 4*(rounds+1); i += 4)
+ {
+ DK[i ] = EK[4*rounds - i ];
+ DK[i+1] = EK[4*rounds - i+1];
+ DK[i+2] = EK[4*rounds - i+2];
+ DK[i+3] = EK[4*rounds - i+3];
+ }
+
+ for(size_t i = 4; i != 4*rounds; i += 4)
+ {
+ InvMixColumn_x4(&DK[i]);
+ }
+
+ if(bswap_keys)
+ {
+ // HW AES on little endian needs the subkeys to be byte reversed
+ for(size_t i = 0; i != EK.size(); ++i)
+ EK[i] = reverse_bytes(EK[i]);
+ for(size_t i = 0; i != DK.size(); ++i)
+ DK[i] = reverse_bytes(DK[i]);
+ }
+
+ CT::unpoison(EK.data(), EK.size());
+ CT::unpoison(DK.data(), DK.size());
+ CT::unpoison(key, length);
+ }
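As a worked example of the recurrence above, take the FIPS-197 Appendix A.1 key 2b7e1516 28aed2a6 abf71588 09cf4f3c: EK[3] = 0x09cf4f3c, SE_word(EK[3]) = 0x018a84eb, rotl<8> of that is 0x8a84eb01, and EK[4] = EK[0] ^ RC[0] ^ 0x8a84eb01 = 0x2b7e1516 ^ 0x01000000 ^ 0x8a84eb01 = 0xa0fafe17, matching w[4] in the standard's key-expansion example.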
+
+size_t aes_parallelism()
+ {
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return 4; // pipelined
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return 2; // pipelined
+ }
+#endif
+
+ // bitsliced:
+ return 2;
+ }
+
+const char* aes_provider()
+ {
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return "cpu";
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return "vperm";
+ }
+#endif
+
+ return "base";
+ }
+
+}
+
+std::string AES_128::provider() const { return aes_provider(); }
+std::string AES_192::provider() const { return aes_provider(); }
+std::string AES_256::provider() const { return aes_provider(); }
+
+size_t AES_128::parallelism() const { return aes_parallelism(); }
+size_t AES_192::parallelism() const { return aes_parallelism(); }
+size_t AES_256::parallelism() const { return aes_parallelism(); }
+
+void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ verify_key_set(m_EK.empty() == false);
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return hw_aes_encrypt_n(in, out, blocks);
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_encrypt_n(in, out, blocks);
+ }
+#endif
+
+ aes_encrypt_n(in, out, blocks, m_EK);
+ }
+
+void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ verify_key_set(m_DK.empty() == false);
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return hw_aes_decrypt_n(in, out, blocks);
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_decrypt_n(in, out, blocks);
+ }
+#endif
+
+ aes_decrypt_n(in, out, blocks, m_DK);
+ }
+
+void AES_128::key_schedule(const uint8_t key[], size_t length)
+ {
+#if defined(BOTAN_HAS_AES_NI)
+ if(CPUID::has_aes_ni())
+ {
+ return aesni_key_schedule(key, length);
+ }
+#endif
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian());
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_key_schedule(key, length);
+ }
+#endif
+
+ aes_key_schedule(key, length, m_EK, m_DK);
+ }
+
+void AES_128::clear()
+ {
+ zap(m_EK);
+ zap(m_DK);
+ }
+
+void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ verify_key_set(m_EK.empty() == false);
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return hw_aes_encrypt_n(in, out, blocks);
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_encrypt_n(in, out, blocks);
+ }
+#endif
+
+ aes_encrypt_n(in, out, blocks, m_EK);
+ }
+
+void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ verify_key_set(m_DK.empty() == false);
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return hw_aes_decrypt_n(in, out, blocks);
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_decrypt_n(in, out, blocks);
+ }
+#endif
+
+ aes_decrypt_n(in, out, blocks, m_DK);
+ }
+
+void AES_192::key_schedule(const uint8_t key[], size_t length)
+ {
+#if defined(BOTAN_HAS_AES_NI)
+ if(CPUID::has_aes_ni())
+ {
+ return aesni_key_schedule(key, length);
+ }
+#endif
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian());
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_key_schedule(key, length);
+ }
+#endif
+
+ aes_key_schedule(key, length, m_EK, m_DK);
+ }
+
+void AES_192::clear()
+ {
+ zap(m_EK);
+ zap(m_DK);
+ }
+
+void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ verify_key_set(m_EK.empty() == false);
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return hw_aes_encrypt_n(in, out, blocks);
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_encrypt_n(in, out, blocks);
+ }
+#endif
+
+ aes_encrypt_n(in, out, blocks, m_EK);
+ }
+
+void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ verify_key_set(m_DK.empty() == false);
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return hw_aes_decrypt_n(in, out, blocks);
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_decrypt_n(in, out, blocks);
+ }
+#endif
+
+ aes_decrypt_n(in, out, blocks, m_DK);
+ }
+
+void AES_256::key_schedule(const uint8_t key[], size_t length)
+ {
+#if defined(BOTAN_HAS_AES_NI)
+ if(CPUID::has_aes_ni())
+ {
+ return aesni_key_schedule(key, length);
+ }
+#endif
+
+#if defined(BOTAN_HAS_HW_AES_SUPPORT)
+ if(CPUID::has_hw_aes())
+ {
+ return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian());
+ }
+#endif
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ if(CPUID::has_vperm())
+ {
+ return vperm_key_schedule(key, length);
+ }
+#endif
+
+ aes_key_schedule(key, length, m_EK, m_DK);
+ }
+
+void AES_256::clear()
+ {
+ zap(m_EK);
+ zap(m_DK);
+ }
+
+}
diff --git a/comm/third_party/botan/src/lib/block/aes/aes.h b/comm/third_party/botan/src/lib/block/aes/aes.h
new file mode 100644
index 0000000000..76248200d4
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes.h
@@ -0,0 +1,131 @@
+/*
+* AES
+* (C) 1999-2010 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#ifndef BOTAN_AES_H_
+#define BOTAN_AES_H_
+
+#include <botan/block_cipher.h>
+
+BOTAN_FUTURE_INTERNAL_HEADER(aes.h)
+
+namespace Botan {
+
+/**
+* AES-128
+*/
+class BOTAN_PUBLIC_API(2,0) AES_128 final : public Block_Cipher_Fixed_Params<16, 16>
+ {
+ public:
+ void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override;
+ void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override;
+
+ void clear() override;
+
+ std::string provider() const override;
+ std::string name() const override { return "AES-128"; }
+ BlockCipher* clone() const override { return new AES_128; }
+ size_t parallelism() const override;
+
+ private:
+ void key_schedule(const uint8_t key[], size_t length) override;
+
+#if defined(BOTAN_HAS_AES_VPERM)
+ void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void vperm_key_schedule(const uint8_t key[], size_t length);
+#endif
+
+#if defined(BOTAN_HAS_AES_NI)
+ void aesni_key_schedule(const uint8_t key[], size_t length);
+#endif
+
+#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
+ void hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+#endif
+
+ secure_vector<uint32_t> m_EK, m_DK;
+ };
+
+/**
+* AES-192
+*/
+class BOTAN_PUBLIC_API(2,0) AES_192 final : public Block_Cipher_Fixed_Params<16, 24>
+ {
+ public:
+ void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override;
+ void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override;
+
+ void clear() override;
+
+ std::string provider() const override;
+ std::string name() const override { return "AES-192"; }
+ BlockCipher* clone() const override { return new AES_192; }
+ size_t parallelism() const override;
+
+ private:
+#if defined(BOTAN_HAS_AES_VPERM)
+ void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void vperm_key_schedule(const uint8_t key[], size_t length);
+#endif
+
+#if defined(BOTAN_HAS_AES_NI)
+ void aesni_key_schedule(const uint8_t key[], size_t length);
+#endif
+
+#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
+ void hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+#endif
+
+ void key_schedule(const uint8_t key[], size_t length) override;
+
+ secure_vector<uint32_t> m_EK, m_DK;
+ };
+
+/**
+* AES-256
+*/
+class BOTAN_PUBLIC_API(2,0) AES_256 final : public Block_Cipher_Fixed_Params<16, 32>
+ {
+ public:
+ void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override;
+ void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override;
+
+ void clear() override;
+
+ std::string provider() const override;
+
+ std::string name() const override { return "AES-256"; }
+ BlockCipher* clone() const override { return new AES_256; }
+ size_t parallelism() const override;
+
+ private:
+#if defined(BOTAN_HAS_AES_VPERM)
+ void vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void vperm_key_schedule(const uint8_t key[], size_t length);
+#endif
+
+#if defined(BOTAN_HAS_AES_NI)
+ void aesni_key_schedule(const uint8_t key[], size_t length);
+#endif
+
+#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
+ void hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+ void hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const;
+#endif
+
+ void key_schedule(const uint8_t key[], size_t length) override;
+
+ secure_vector<uint32_t> m_EK, m_DK;
+ };
+
+}
+
+#endif
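For context, a minimal usage sketch of these classes through the Botan 2.x BlockCipher interface (the key and plaintext below are arbitrary placeholders; set_key/encrypt/decrypt come from block_cipher.h):

   #include <botan/aes.h>

   void aes128_roundtrip_example()
      {
      const uint8_t key[16] = { 0 };   // placeholder key bytes
      uint8_t block[16] = { 0 };       // placeholder plaintext block

      Botan::AES_128 aes;
      aes.set_key(key, sizeof(key));   // provider (HW AES, vperm, bitsliced) chosen at runtime
      aes.encrypt(block);              // one 16-byte block, in place
      aes.decrypt(block);              // restores the original plaintext
      }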
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_armv8/aes_armv8.cpp b/comm/third_party/botan/src/lib/block/aes/aes_armv8/aes_armv8.cpp
new file mode 100644
index 0000000000..9766bf88c9
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_armv8/aes_armv8.cpp
@@ -0,0 +1,484 @@
+/*
+* AES using ARMv8
+* Contributed by Jeffrey Walton
+*
+* Further changes
+* (C) 2017,2018 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/aes.h>
+#include <botan/loadstor.h>
+#include <arm_neon.h>
+
+namespace Botan {
+
+#define AES_ENC_4_ROUNDS(K) \
+ do \
+ { \
+ B0 = vaesmcq_u8(vaeseq_u8(B0, K)); \
+ B1 = vaesmcq_u8(vaeseq_u8(B1, K)); \
+ B2 = vaesmcq_u8(vaeseq_u8(B2, K)); \
+ B3 = vaesmcq_u8(vaeseq_u8(B3, K)); \
+ } while(0)
+
+#define AES_ENC_4_LAST_ROUNDS(K, K2) \
+ do \
+ { \
+ B0 = veorq_u8(vaeseq_u8(B0, K), K2); \
+ B1 = veorq_u8(vaeseq_u8(B1, K), K2); \
+ B2 = veorq_u8(vaeseq_u8(B2, K), K2); \
+ B3 = veorq_u8(vaeseq_u8(B3, K), K2); \
+ } while(0)
+
+#define AES_DEC_4_ROUNDS(K) \
+ do \
+ { \
+ B0 = vaesimcq_u8(vaesdq_u8(B0, K)); \
+ B1 = vaesimcq_u8(vaesdq_u8(B1, K)); \
+ B2 = vaesimcq_u8(vaesdq_u8(B2, K)); \
+ B3 = vaesimcq_u8(vaesdq_u8(B3, K)); \
+ } while(0)
+
+#define AES_DEC_4_LAST_ROUNDS(K, K2) \
+ do \
+ { \
+ B0 = veorq_u8(vaesdq_u8(B0, K), K2); \
+ B1 = veorq_u8(vaesdq_u8(B1, K), K2); \
+ B2 = veorq_u8(vaesdq_u8(B2, K), K2); \
+ B3 = veorq_u8(vaesdq_u8(B3, K), K2); \
+ } while(0)
+
+/*
+* AES-128 Encryption
+*/
+BOTAN_FUNC_ISA("+crypto")
+void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());
+
+ const uint8x16_t K0 = vld1q_u8(skey + 0*16);
+ const uint8x16_t K1 = vld1q_u8(skey + 1*16);
+ const uint8x16_t K2 = vld1q_u8(skey + 2*16);
+ const uint8x16_t K3 = vld1q_u8(skey + 3*16);
+ const uint8x16_t K4 = vld1q_u8(skey + 4*16);
+ const uint8x16_t K5 = vld1q_u8(skey + 5*16);
+ const uint8x16_t K6 = vld1q_u8(skey + 6*16);
+ const uint8x16_t K7 = vld1q_u8(skey + 7*16);
+ const uint8x16_t K8 = vld1q_u8(skey + 8*16);
+ const uint8x16_t K9 = vld1q_u8(skey + 9*16);
+ const uint8x16_t K10 = vld1q_u8(skey + 10*16);
+
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_ENC_4_ROUNDS(K0);
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_LAST_ROUNDS(K9, K10);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesmcq_u8(vaeseq_u8(B, K0));
+ B = vaesmcq_u8(vaeseq_u8(B, K1));
+ B = vaesmcq_u8(vaeseq_u8(B, K2));
+ B = vaesmcq_u8(vaeseq_u8(B, K3));
+ B = vaesmcq_u8(vaeseq_u8(B, K4));
+ B = vaesmcq_u8(vaeseq_u8(B, K5));
+ B = vaesmcq_u8(vaeseq_u8(B, K6));
+ B = vaesmcq_u8(vaeseq_u8(B, K7));
+ B = vaesmcq_u8(vaeseq_u8(B, K8));
+ B = veorq_u8(vaeseq_u8(B, K9), K10);
+ vst1q_u8(out+16*i, B);
+ }
+ }
+
+/*
+* AES-128 Decryption
+*/
+BOTAN_FUNC_ISA("+crypto")
+void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());
+
+ const uint8x16_t K0 = vld1q_u8(skey + 0*16);
+ const uint8x16_t K1 = vld1q_u8(skey + 1*16);
+ const uint8x16_t K2 = vld1q_u8(skey + 2*16);
+ const uint8x16_t K3 = vld1q_u8(skey + 3*16);
+ const uint8x16_t K4 = vld1q_u8(skey + 4*16);
+ const uint8x16_t K5 = vld1q_u8(skey + 5*16);
+ const uint8x16_t K6 = vld1q_u8(skey + 6*16);
+ const uint8x16_t K7 = vld1q_u8(skey + 7*16);
+ const uint8x16_t K8 = vld1q_u8(skey + 8*16);
+ const uint8x16_t K9 = vld1q_u8(skey + 9*16);
+ const uint8x16_t K10 = vld1q_u8(skey + 10*16);
+
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_DEC_4_ROUNDS(K0);
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_LAST_ROUNDS(K9, K10);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesimcq_u8(vaesdq_u8(B, K0));
+ B = vaesimcq_u8(vaesdq_u8(B, K1));
+ B = vaesimcq_u8(vaesdq_u8(B, K2));
+ B = vaesimcq_u8(vaesdq_u8(B, K3));
+ B = vaesimcq_u8(vaesdq_u8(B, K4));
+ B = vaesimcq_u8(vaesdq_u8(B, K5));
+ B = vaesimcq_u8(vaesdq_u8(B, K6));
+ B = vaesimcq_u8(vaesdq_u8(B, K7));
+ B = vaesimcq_u8(vaesdq_u8(B, K8));
+ B = veorq_u8(vaesdq_u8(B, K9), K10);
+ vst1q_u8(out+16*i, B);
+ }
+ }
+
+/*
+* AES-192 Encryption
+*/
+BOTAN_FUNC_ISA("+crypto")
+void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());
+
+ const uint8x16_t K0 = vld1q_u8(skey + 0*16);
+ const uint8x16_t K1 = vld1q_u8(skey + 1*16);
+ const uint8x16_t K2 = vld1q_u8(skey + 2*16);
+ const uint8x16_t K3 = vld1q_u8(skey + 3*16);
+ const uint8x16_t K4 = vld1q_u8(skey + 4*16);
+ const uint8x16_t K5 = vld1q_u8(skey + 5*16);
+ const uint8x16_t K6 = vld1q_u8(skey + 6*16);
+ const uint8x16_t K7 = vld1q_u8(skey + 7*16);
+ const uint8x16_t K8 = vld1q_u8(skey + 8*16);
+ const uint8x16_t K9 = vld1q_u8(skey + 9*16);
+ const uint8x16_t K10 = vld1q_u8(skey + 10*16);
+ const uint8x16_t K11 = vld1q_u8(skey + 11*16);
+ const uint8x16_t K12 = vld1q_u8(skey + 12*16);
+
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_ENC_4_ROUNDS(K0);
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_ROUNDS(K9);
+ AES_ENC_4_ROUNDS(K10);
+ AES_ENC_4_LAST_ROUNDS(K11, K12);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesmcq_u8(vaeseq_u8(B, K0));
+ B = vaesmcq_u8(vaeseq_u8(B, K1));
+ B = vaesmcq_u8(vaeseq_u8(B, K2));
+ B = vaesmcq_u8(vaeseq_u8(B, K3));
+ B = vaesmcq_u8(vaeseq_u8(B, K4));
+ B = vaesmcq_u8(vaeseq_u8(B, K5));
+ B = vaesmcq_u8(vaeseq_u8(B, K6));
+ B = vaesmcq_u8(vaeseq_u8(B, K7));
+ B = vaesmcq_u8(vaeseq_u8(B, K8));
+ B = vaesmcq_u8(vaeseq_u8(B, K9));
+ B = vaesmcq_u8(vaeseq_u8(B, K10));
+ B = veorq_u8(vaeseq_u8(B, K11), K12);
+ vst1q_u8(out+16*i, B);
+ }
+ }
+
+/*
+* AES-192 Decryption
+*/
+BOTAN_FUNC_ISA("+crypto")
+void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());
+
+ const uint8x16_t K0 = vld1q_u8(skey + 0*16);
+ const uint8x16_t K1 = vld1q_u8(skey + 1*16);
+ const uint8x16_t K2 = vld1q_u8(skey + 2*16);
+ const uint8x16_t K3 = vld1q_u8(skey + 3*16);
+ const uint8x16_t K4 = vld1q_u8(skey + 4*16);
+ const uint8x16_t K5 = vld1q_u8(skey + 5*16);
+ const uint8x16_t K6 = vld1q_u8(skey + 6*16);
+ const uint8x16_t K7 = vld1q_u8(skey + 7*16);
+ const uint8x16_t K8 = vld1q_u8(skey + 8*16);
+ const uint8x16_t K9 = vld1q_u8(skey + 9*16);
+ const uint8x16_t K10 = vld1q_u8(skey + 10*16);
+ const uint8x16_t K11 = vld1q_u8(skey + 11*16);
+ const uint8x16_t K12 = vld1q_u8(skey + 12*16);
+
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_DEC_4_ROUNDS(K0);
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_ROUNDS(K9);
+ AES_DEC_4_ROUNDS(K10);
+ AES_DEC_4_LAST_ROUNDS(K11, K12);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesimcq_u8(vaesdq_u8(B, K0));
+ B = vaesimcq_u8(vaesdq_u8(B, K1));
+ B = vaesimcq_u8(vaesdq_u8(B, K2));
+ B = vaesimcq_u8(vaesdq_u8(B, K3));
+ B = vaesimcq_u8(vaesdq_u8(B, K4));
+ B = vaesimcq_u8(vaesdq_u8(B, K5));
+ B = vaesimcq_u8(vaesdq_u8(B, K6));
+ B = vaesimcq_u8(vaesdq_u8(B, K7));
+ B = vaesimcq_u8(vaesdq_u8(B, K8));
+ B = vaesimcq_u8(vaesdq_u8(B, K9));
+ B = vaesimcq_u8(vaesdq_u8(B, K10));
+ B = veorq_u8(vaesdq_u8(B, K11), K12);
+ vst1q_u8(out+16*i, B);
+ }
+ }
+
+/*
+* AES-256 Encryption
+*/
+BOTAN_FUNC_ISA("+crypto")
+void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());
+
+ const uint8x16_t K0 = vld1q_u8(skey + 0*16);
+ const uint8x16_t K1 = vld1q_u8(skey + 1*16);
+ const uint8x16_t K2 = vld1q_u8(skey + 2*16);
+ const uint8x16_t K3 = vld1q_u8(skey + 3*16);
+ const uint8x16_t K4 = vld1q_u8(skey + 4*16);
+ const uint8x16_t K5 = vld1q_u8(skey + 5*16);
+ const uint8x16_t K6 = vld1q_u8(skey + 6*16);
+ const uint8x16_t K7 = vld1q_u8(skey + 7*16);
+ const uint8x16_t K8 = vld1q_u8(skey + 8*16);
+ const uint8x16_t K9 = vld1q_u8(skey + 9*16);
+ const uint8x16_t K10 = vld1q_u8(skey + 10*16);
+ const uint8x16_t K11 = vld1q_u8(skey + 11*16);
+ const uint8x16_t K12 = vld1q_u8(skey + 12*16);
+ const uint8x16_t K13 = vld1q_u8(skey + 13*16);
+ const uint8x16_t K14 = vld1q_u8(skey + 14*16);
+
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_ENC_4_ROUNDS(K0);
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_ROUNDS(K9);
+ AES_ENC_4_ROUNDS(K10);
+ AES_ENC_4_ROUNDS(K11);
+ AES_ENC_4_ROUNDS(K12);
+ AES_ENC_4_LAST_ROUNDS(K13, K14);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesmcq_u8(vaeseq_u8(B, K0));
+ B = vaesmcq_u8(vaeseq_u8(B, K1));
+ B = vaesmcq_u8(vaeseq_u8(B, K2));
+ B = vaesmcq_u8(vaeseq_u8(B, K3));
+ B = vaesmcq_u8(vaeseq_u8(B, K4));
+ B = vaesmcq_u8(vaeseq_u8(B, K5));
+ B = vaesmcq_u8(vaeseq_u8(B, K6));
+ B = vaesmcq_u8(vaeseq_u8(B, K7));
+ B = vaesmcq_u8(vaeseq_u8(B, K8));
+ B = vaesmcq_u8(vaeseq_u8(B, K9));
+ B = vaesmcq_u8(vaeseq_u8(B, K10));
+ B = vaesmcq_u8(vaeseq_u8(B, K11));
+ B = vaesmcq_u8(vaeseq_u8(B, K12));
+ B = veorq_u8(vaeseq_u8(B, K13), K14);
+ vst1q_u8(out+16*i, B);
+ }
+ }
+
+/*
+* AES-256 Decryption
+*/
+BOTAN_FUNC_ISA("+crypto")
+void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());
+
+ const uint8x16_t K0 = vld1q_u8(skey + 0*16);
+ const uint8x16_t K1 = vld1q_u8(skey + 1*16);
+ const uint8x16_t K2 = vld1q_u8(skey + 2*16);
+ const uint8x16_t K3 = vld1q_u8(skey + 3*16);
+ const uint8x16_t K4 = vld1q_u8(skey + 4*16);
+ const uint8x16_t K5 = vld1q_u8(skey + 5*16);
+ const uint8x16_t K6 = vld1q_u8(skey + 6*16);
+ const uint8x16_t K7 = vld1q_u8(skey + 7*16);
+ const uint8x16_t K8 = vld1q_u8(skey + 8*16);
+ const uint8x16_t K9 = vld1q_u8(skey + 9*16);
+ const uint8x16_t K10 = vld1q_u8(skey + 10*16);
+ const uint8x16_t K11 = vld1q_u8(skey + 11*16);
+ const uint8x16_t K12 = vld1q_u8(skey + 12*16);
+ const uint8x16_t K13 = vld1q_u8(skey + 13*16);
+ const uint8x16_t K14 = vld1q_u8(skey + 14*16);
+
+ while(blocks >= 4)
+ {
+ uint8x16_t B0 = vld1q_u8(in);
+ uint8x16_t B1 = vld1q_u8(in+16);
+ uint8x16_t B2 = vld1q_u8(in+32);
+ uint8x16_t B3 = vld1q_u8(in+48);
+
+ AES_DEC_4_ROUNDS(K0);
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_ROUNDS(K9);
+ AES_DEC_4_ROUNDS(K10);
+ AES_DEC_4_ROUNDS(K11);
+ AES_DEC_4_ROUNDS(K12);
+ AES_DEC_4_LAST_ROUNDS(K13, K14);
+
+ vst1q_u8(out, B0);
+ vst1q_u8(out+16, B1);
+ vst1q_u8(out+32, B2);
+ vst1q_u8(out+48, B3);
+
+ in += 16*4;
+ out += 16*4;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ uint8x16_t B = vld1q_u8(in+16*i);
+ B = vaesimcq_u8(vaesdq_u8(B, K0));
+ B = vaesimcq_u8(vaesdq_u8(B, K1));
+ B = vaesimcq_u8(vaesdq_u8(B, K2));
+ B = vaesimcq_u8(vaesdq_u8(B, K3));
+ B = vaesimcq_u8(vaesdq_u8(B, K4));
+ B = vaesimcq_u8(vaesdq_u8(B, K5));
+ B = vaesimcq_u8(vaesdq_u8(B, K6));
+ B = vaesimcq_u8(vaesdq_u8(B, K7));
+ B = vaesimcq_u8(vaesdq_u8(B, K8));
+ B = vaesimcq_u8(vaesdq_u8(B, K9));
+ B = vaesimcq_u8(vaesdq_u8(B, K10));
+ B = vaesimcq_u8(vaesdq_u8(B, K11));
+ B = vaesimcq_u8(vaesdq_u8(B, K12));
+ B = veorq_u8(vaesdq_u8(B, K13), K14);
+ vst1q_u8(out+16*i, B);
+ }
+ }
+
+#undef AES_ENC_4_ROUNDS
+#undef AES_ENC_4_LAST_ROUNDS
+#undef AES_DEC_4_ROUNDS
+#undef AES_DEC_4_LAST_ROUNDS
+
+}
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_armv8/info.txt b/comm/third_party/botan/src/lib/block/aes/aes_armv8/info.txt
new file mode 100644
index 0000000000..1864f215b4
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_armv8/info.txt
@@ -0,0 +1,12 @@
+<defines>
+AES_ARMV8 -> 20170903
+</defines>
+
+<isa>
+armv8crypto
+</isa>
+
+<cc>
+gcc:5
+clang:3.8
+</cc>
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_ni/aes_ni.cpp b/comm/third_party/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
new file mode 100644
index 0000000000..76c695f32c
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_ni/aes_ni.cpp
@@ -0,0 +1,780 @@
+/*
+* AES using AES-NI instructions
+* (C) 2009,2012 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/aes.h>
+#include <botan/loadstor.h>
+#include <wmmintrin.h>
+
+namespace Botan {
+
+namespace {
+
+BOTAN_FUNC_ISA("ssse3")
+__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
+ {
+ key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
+ key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+ key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+ key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+ return _mm_xor_si128(key, key_with_rcon);
+ }
+
+BOTAN_FUNC_ISA("ssse3")
+void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
+ uint32_t out[], bool last)
+ {
+ __m128i key1 = *K1;
+ __m128i key2 = *K2;
+
+ key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
+ key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
+ key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
+ key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
+ key1 = _mm_xor_si128(key1, key2_with_rcon);
+
+ *K1 = key1;
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);
+
+ if(last)
+ return;
+
+ key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
+ key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
+
+ *K2 = key2;
+ out[4] = _mm_cvtsi128_si32(key2);
+ out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
+ }
+
+/*
+* The second half of the AES-256 key expansion (other half same as AES-128)
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+__m128i aes_256_key_expansion(__m128i key, __m128i key2)
+ {
+ __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
+ key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
+
+ key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+ key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+ key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
+ return _mm_xor_si128(key, key_with_rcon);
+ }
+
+}
+
+#define AES_ENC_4_ROUNDS(K) \
+ do \
+ { \
+ B0 = _mm_aesenc_si128(B0, K); \
+ B1 = _mm_aesenc_si128(B1, K); \
+ B2 = _mm_aesenc_si128(B2, K); \
+ B3 = _mm_aesenc_si128(B3, K); \
+ } while(0)
+
+#define AES_ENC_4_LAST_ROUNDS(K) \
+ do \
+ { \
+ B0 = _mm_aesenclast_si128(B0, K); \
+ B1 = _mm_aesenclast_si128(B1, K); \
+ B2 = _mm_aesenclast_si128(B2, K); \
+ B3 = _mm_aesenclast_si128(B3, K); \
+ } while(0)
+
+#define AES_DEC_4_ROUNDS(K) \
+ do \
+ { \
+ B0 = _mm_aesdec_si128(B0, K); \
+ B1 = _mm_aesdec_si128(B1, K); \
+ B2 = _mm_aesdec_si128(B2, K); \
+ B3 = _mm_aesdec_si128(B3, K); \
+ } while(0)
+
+#define AES_DEC_4_LAST_ROUNDS(K) \
+ do \
+ { \
+ B0 = _mm_aesdeclast_si128(B0, K); \
+ B1 = _mm_aesdeclast_si128(B1, K); \
+ B2 = _mm_aesdeclast_si128(B2, K); \
+ B3 = _mm_aesdeclast_si128(B3, K); \
+ } while(0)
+
+/*
+* AES-128 Encryption
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
+ __m128i* out_mm = reinterpret_cast<__m128i*>(out);
+
+ const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
+
+ const __m128i K0 = _mm_loadu_si128(key_mm);
+ const __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ const __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ const __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ const __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ const __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ const __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ const __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ const __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ const __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ const __m128i K10 = _mm_loadu_si128(key_mm + 10);
+
+ while(blocks >= 4)
+ {
+ __m128i B0 = _mm_loadu_si128(in_mm + 0);
+ __m128i B1 = _mm_loadu_si128(in_mm + 1);
+ __m128i B2 = _mm_loadu_si128(in_mm + 2);
+ __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+ B0 = _mm_xor_si128(B0, K0);
+ B1 = _mm_xor_si128(B1, K0);
+ B2 = _mm_xor_si128(B2, K0);
+ B3 = _mm_xor_si128(B3, K0);
+
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_ROUNDS(K9);
+ AES_ENC_4_LAST_ROUNDS(K10);
+
+ _mm_storeu_si128(out_mm + 0, B0);
+ _mm_storeu_si128(out_mm + 1, B1);
+ _mm_storeu_si128(out_mm + 2, B2);
+ _mm_storeu_si128(out_mm + 3, B3);
+
+ blocks -= 4;
+ in_mm += 4;
+ out_mm += 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesenc_si128(B, K1);
+ B = _mm_aesenc_si128(B, K2);
+ B = _mm_aesenc_si128(B, K3);
+ B = _mm_aesenc_si128(B, K4);
+ B = _mm_aesenc_si128(B, K5);
+ B = _mm_aesenc_si128(B, K6);
+ B = _mm_aesenc_si128(B, K7);
+ B = _mm_aesenc_si128(B, K8);
+ B = _mm_aesenc_si128(B, K9);
+ B = _mm_aesenclast_si128(B, K10);
+
+ _mm_storeu_si128(out_mm + i, B);
+ }
+ }
+
+/*
+* AES-128 Decryption
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
+ __m128i* out_mm = reinterpret_cast<__m128i*>(out);
+
+ const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
+
+ const __m128i K0 = _mm_loadu_si128(key_mm);
+ const __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ const __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ const __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ const __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ const __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ const __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ const __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ const __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ const __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ const __m128i K10 = _mm_loadu_si128(key_mm + 10);
+
+ while(blocks >= 4)
+ {
+ __m128i B0 = _mm_loadu_si128(in_mm + 0);
+ __m128i B1 = _mm_loadu_si128(in_mm + 1);
+ __m128i B2 = _mm_loadu_si128(in_mm + 2);
+ __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+ B0 = _mm_xor_si128(B0, K0);
+ B1 = _mm_xor_si128(B1, K0);
+ B2 = _mm_xor_si128(B2, K0);
+ B3 = _mm_xor_si128(B3, K0);
+
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_ROUNDS(K9);
+ AES_DEC_4_LAST_ROUNDS(K10);
+
+ _mm_storeu_si128(out_mm + 0, B0);
+ _mm_storeu_si128(out_mm + 1, B1);
+ _mm_storeu_si128(out_mm + 2, B2);
+ _mm_storeu_si128(out_mm + 3, B3);
+
+ blocks -= 4;
+ in_mm += 4;
+ out_mm += 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesdec_si128(B, K1);
+ B = _mm_aesdec_si128(B, K2);
+ B = _mm_aesdec_si128(B, K3);
+ B = _mm_aesdec_si128(B, K4);
+ B = _mm_aesdec_si128(B, K5);
+ B = _mm_aesdec_si128(B, K6);
+ B = _mm_aesdec_si128(B, K7);
+ B = _mm_aesdec_si128(B, K8);
+ B = _mm_aesdec_si128(B, K9);
+ B = _mm_aesdeclast_si128(B, K10);
+
+ _mm_storeu_si128(out_mm + i, B);
+ }
+ }
+
+/*
+* AES-128 Key Schedule
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_128::aesni_key_schedule(const uint8_t key[], size_t)
+ {
+ m_EK.resize(44);
+ m_DK.resize(44);
+
+ #define AES_128_key_exp(K, RCON) \
+ aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
+
+ const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
+ const __m128i K1 = AES_128_key_exp(K0, 0x01);
+ const __m128i K2 = AES_128_key_exp(K1, 0x02);
+ const __m128i K3 = AES_128_key_exp(K2, 0x04);
+ const __m128i K4 = AES_128_key_exp(K3, 0x08);
+ const __m128i K5 = AES_128_key_exp(K4, 0x10);
+ const __m128i K6 = AES_128_key_exp(K5, 0x20);
+ const __m128i K7 = AES_128_key_exp(K6, 0x40);
+ const __m128i K8 = AES_128_key_exp(K7, 0x80);
+ const __m128i K9 = AES_128_key_exp(K8, 0x1B);
+ const __m128i K10 = AES_128_key_exp(K9, 0x36);
+
+ __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
+ _mm_storeu_si128(EK_mm , K0);
+ _mm_storeu_si128(EK_mm + 1, K1);
+ _mm_storeu_si128(EK_mm + 2, K2);
+ _mm_storeu_si128(EK_mm + 3, K3);
+ _mm_storeu_si128(EK_mm + 4, K4);
+ _mm_storeu_si128(EK_mm + 5, K5);
+ _mm_storeu_si128(EK_mm + 6, K6);
+ _mm_storeu_si128(EK_mm + 7, K7);
+ _mm_storeu_si128(EK_mm + 8, K8);
+ _mm_storeu_si128(EK_mm + 9, K9);
+ _mm_storeu_si128(EK_mm + 10, K10);
+
+ // Now generate decryption keys
+
+ __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
+ _mm_storeu_si128(DK_mm , K10);
+ _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
+ _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
+ _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
+ _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
+ _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
+ _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
+ _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
+ _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
+ _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
+ _mm_storeu_si128(DK_mm + 10, K0);
+ }
+
+/*
+* AES-192 Encryption
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
+ __m128i* out_mm = reinterpret_cast<__m128i*>(out);
+
+ const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
+
+ const __m128i K0 = _mm_loadu_si128(key_mm);
+ const __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ const __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ const __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ const __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ const __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ const __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ const __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ const __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ const __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ const __m128i K10 = _mm_loadu_si128(key_mm + 10);
+ const __m128i K11 = _mm_loadu_si128(key_mm + 11);
+ const __m128i K12 = _mm_loadu_si128(key_mm + 12);
+
+ while(blocks >= 4)
+ {
+ __m128i B0 = _mm_loadu_si128(in_mm + 0);
+ __m128i B1 = _mm_loadu_si128(in_mm + 1);
+ __m128i B2 = _mm_loadu_si128(in_mm + 2);
+ __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+ B0 = _mm_xor_si128(B0, K0);
+ B1 = _mm_xor_si128(B1, K0);
+ B2 = _mm_xor_si128(B2, K0);
+ B3 = _mm_xor_si128(B3, K0);
+
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_ROUNDS(K9);
+ AES_ENC_4_ROUNDS(K10);
+ AES_ENC_4_ROUNDS(K11);
+ AES_ENC_4_LAST_ROUNDS(K12);
+
+ _mm_storeu_si128(out_mm + 0, B0);
+ _mm_storeu_si128(out_mm + 1, B1);
+ _mm_storeu_si128(out_mm + 2, B2);
+ _mm_storeu_si128(out_mm + 3, B3);
+
+ blocks -= 4;
+ in_mm += 4;
+ out_mm += 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesenc_si128(B, K1);
+ B = _mm_aesenc_si128(B, K2);
+ B = _mm_aesenc_si128(B, K3);
+ B = _mm_aesenc_si128(B, K4);
+ B = _mm_aesenc_si128(B, K5);
+ B = _mm_aesenc_si128(B, K6);
+ B = _mm_aesenc_si128(B, K7);
+ B = _mm_aesenc_si128(B, K8);
+ B = _mm_aesenc_si128(B, K9);
+ B = _mm_aesenc_si128(B, K10);
+ B = _mm_aesenc_si128(B, K11);
+ B = _mm_aesenclast_si128(B, K12);
+
+ _mm_storeu_si128(out_mm + i, B);
+ }
+ }
+
+/*
+* AES-192 Decryption
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
+ __m128i* out_mm = reinterpret_cast<__m128i*>(out);
+
+ const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
+
+ const __m128i K0 = _mm_loadu_si128(key_mm);
+ const __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ const __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ const __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ const __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ const __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ const __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ const __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ const __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ const __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ const __m128i K10 = _mm_loadu_si128(key_mm + 10);
+ const __m128i K11 = _mm_loadu_si128(key_mm + 11);
+ const __m128i K12 = _mm_loadu_si128(key_mm + 12);
+
+ while(blocks >= 4)
+ {
+ __m128i B0 = _mm_loadu_si128(in_mm + 0);
+ __m128i B1 = _mm_loadu_si128(in_mm + 1);
+ __m128i B2 = _mm_loadu_si128(in_mm + 2);
+ __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+ B0 = _mm_xor_si128(B0, K0);
+ B1 = _mm_xor_si128(B1, K0);
+ B2 = _mm_xor_si128(B2, K0);
+ B3 = _mm_xor_si128(B3, K0);
+
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_ROUNDS(K9);
+ AES_DEC_4_ROUNDS(K10);
+ AES_DEC_4_ROUNDS(K11);
+ AES_DEC_4_LAST_ROUNDS(K12);
+
+ _mm_storeu_si128(out_mm + 0, B0);
+ _mm_storeu_si128(out_mm + 1, B1);
+ _mm_storeu_si128(out_mm + 2, B2);
+ _mm_storeu_si128(out_mm + 3, B3);
+
+ blocks -= 4;
+ in_mm += 4;
+ out_mm += 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesdec_si128(B, K1);
+ B = _mm_aesdec_si128(B, K2);
+ B = _mm_aesdec_si128(B, K3);
+ B = _mm_aesdec_si128(B, K4);
+ B = _mm_aesdec_si128(B, K5);
+ B = _mm_aesdec_si128(B, K6);
+ B = _mm_aesdec_si128(B, K7);
+ B = _mm_aesdec_si128(B, K8);
+ B = _mm_aesdec_si128(B, K9);
+ B = _mm_aesdec_si128(B, K10);
+ B = _mm_aesdec_si128(B, K11);
+ B = _mm_aesdeclast_si128(B, K12);
+
+ _mm_storeu_si128(out_mm + i, B);
+ }
+ }
+
+/*
+* AES-192 Key Schedule
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_192::aesni_key_schedule(const uint8_t key[], size_t)
+ {
+ m_EK.resize(52);
+ m_DK.resize(52);
+
+ __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
+ __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
+ K1 = _mm_srli_si128(K1, 8);
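+   // The 192-bit key is 24 bytes: K0 holds bytes 0..15, and the overlapping
+   // load at key + 8 followed by the 8-byte right shift leaves bytes 16..23
+   // in the low half of K1 (high half zeroed).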
+
+ load_le(m_EK.data(), key, 6);
+
+ #define AES_192_key_exp(RCON, EK_OFF) \
+ aes_192_key_expansion(&K0, &K1, \
+ _mm_aeskeygenassist_si128(K1, RCON), \
+ &m_EK[EK_OFF], EK_OFF == 48)
+
+ AES_192_key_exp(0x01, 6);
+ AES_192_key_exp(0x02, 12);
+ AES_192_key_exp(0x04, 18);
+ AES_192_key_exp(0x08, 24);
+ AES_192_key_exp(0x10, 30);
+ AES_192_key_exp(0x20, 36);
+ AES_192_key_exp(0x40, 42);
+ AES_192_key_exp(0x80, 48);
+
+ #undef AES_192_key_exp
+
+ // Now generate decryption keys
+ const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
+
+ __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
+ _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12));
+ _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
+ _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
+ _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
+ _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
+ _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
+ _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
+ _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
+ _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
+ _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
+ _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
+ _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
+ _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
+ }
+
+/*
+* AES-256 Encryption
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
+ __m128i* out_mm = reinterpret_cast<__m128i*>(out);
+
+ const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
+
+ const __m128i K0 = _mm_loadu_si128(key_mm);
+ const __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ const __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ const __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ const __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ const __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ const __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ const __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ const __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ const __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ const __m128i K10 = _mm_loadu_si128(key_mm + 10);
+ const __m128i K11 = _mm_loadu_si128(key_mm + 11);
+ const __m128i K12 = _mm_loadu_si128(key_mm + 12);
+ const __m128i K13 = _mm_loadu_si128(key_mm + 13);
+ const __m128i K14 = _mm_loadu_si128(key_mm + 14);
+
+ while(blocks >= 4)
+ {
+ __m128i B0 = _mm_loadu_si128(in_mm + 0);
+ __m128i B1 = _mm_loadu_si128(in_mm + 1);
+ __m128i B2 = _mm_loadu_si128(in_mm + 2);
+ __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+ B0 = _mm_xor_si128(B0, K0);
+ B1 = _mm_xor_si128(B1, K0);
+ B2 = _mm_xor_si128(B2, K0);
+ B3 = _mm_xor_si128(B3, K0);
+
+ AES_ENC_4_ROUNDS(K1);
+ AES_ENC_4_ROUNDS(K2);
+ AES_ENC_4_ROUNDS(K3);
+ AES_ENC_4_ROUNDS(K4);
+ AES_ENC_4_ROUNDS(K5);
+ AES_ENC_4_ROUNDS(K6);
+ AES_ENC_4_ROUNDS(K7);
+ AES_ENC_4_ROUNDS(K8);
+ AES_ENC_4_ROUNDS(K9);
+ AES_ENC_4_ROUNDS(K10);
+ AES_ENC_4_ROUNDS(K11);
+ AES_ENC_4_ROUNDS(K12);
+ AES_ENC_4_ROUNDS(K13);
+ AES_ENC_4_LAST_ROUNDS(K14);
+
+ _mm_storeu_si128(out_mm + 0, B0);
+ _mm_storeu_si128(out_mm + 1, B1);
+ _mm_storeu_si128(out_mm + 2, B2);
+ _mm_storeu_si128(out_mm + 3, B3);
+
+ blocks -= 4;
+ in_mm += 4;
+ out_mm += 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesenc_si128(B, K1);
+ B = _mm_aesenc_si128(B, K2);
+ B = _mm_aesenc_si128(B, K3);
+ B = _mm_aesenc_si128(B, K4);
+ B = _mm_aesenc_si128(B, K5);
+ B = _mm_aesenc_si128(B, K6);
+ B = _mm_aesenc_si128(B, K7);
+ B = _mm_aesenc_si128(B, K8);
+ B = _mm_aesenc_si128(B, K9);
+ B = _mm_aesenc_si128(B, K10);
+ B = _mm_aesenc_si128(B, K11);
+ B = _mm_aesenc_si128(B, K12);
+ B = _mm_aesenc_si128(B, K13);
+ B = _mm_aesenclast_si128(B, K14);
+
+ _mm_storeu_si128(out_mm + i, B);
+ }
+ }
+
+/*
+* AES-256 Decryption
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
+ __m128i* out_mm = reinterpret_cast<__m128i*>(out);
+
+ const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
+
+ const __m128i K0 = _mm_loadu_si128(key_mm);
+ const __m128i K1 = _mm_loadu_si128(key_mm + 1);
+ const __m128i K2 = _mm_loadu_si128(key_mm + 2);
+ const __m128i K3 = _mm_loadu_si128(key_mm + 3);
+ const __m128i K4 = _mm_loadu_si128(key_mm + 4);
+ const __m128i K5 = _mm_loadu_si128(key_mm + 5);
+ const __m128i K6 = _mm_loadu_si128(key_mm + 6);
+ const __m128i K7 = _mm_loadu_si128(key_mm + 7);
+ const __m128i K8 = _mm_loadu_si128(key_mm + 8);
+ const __m128i K9 = _mm_loadu_si128(key_mm + 9);
+ const __m128i K10 = _mm_loadu_si128(key_mm + 10);
+ const __m128i K11 = _mm_loadu_si128(key_mm + 11);
+ const __m128i K12 = _mm_loadu_si128(key_mm + 12);
+ const __m128i K13 = _mm_loadu_si128(key_mm + 13);
+ const __m128i K14 = _mm_loadu_si128(key_mm + 14);
+
+ while(blocks >= 4)
+ {
+ __m128i B0 = _mm_loadu_si128(in_mm + 0);
+ __m128i B1 = _mm_loadu_si128(in_mm + 1);
+ __m128i B2 = _mm_loadu_si128(in_mm + 2);
+ __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+ B0 = _mm_xor_si128(B0, K0);
+ B1 = _mm_xor_si128(B1, K0);
+ B2 = _mm_xor_si128(B2, K0);
+ B3 = _mm_xor_si128(B3, K0);
+
+ AES_DEC_4_ROUNDS(K1);
+ AES_DEC_4_ROUNDS(K2);
+ AES_DEC_4_ROUNDS(K3);
+ AES_DEC_4_ROUNDS(K4);
+ AES_DEC_4_ROUNDS(K5);
+ AES_DEC_4_ROUNDS(K6);
+ AES_DEC_4_ROUNDS(K7);
+ AES_DEC_4_ROUNDS(K8);
+ AES_DEC_4_ROUNDS(K9);
+ AES_DEC_4_ROUNDS(K10);
+ AES_DEC_4_ROUNDS(K11);
+ AES_DEC_4_ROUNDS(K12);
+ AES_DEC_4_ROUNDS(K13);
+ AES_DEC_4_LAST_ROUNDS(K14);
+
+ _mm_storeu_si128(out_mm + 0, B0);
+ _mm_storeu_si128(out_mm + 1, B1);
+ _mm_storeu_si128(out_mm + 2, B2);
+ _mm_storeu_si128(out_mm + 3, B3);
+
+ blocks -= 4;
+ in_mm += 4;
+ out_mm += 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ __m128i B = _mm_loadu_si128(in_mm + i);
+
+ B = _mm_xor_si128(B, K0);
+
+ B = _mm_aesdec_si128(B, K1);
+ B = _mm_aesdec_si128(B, K2);
+ B = _mm_aesdec_si128(B, K3);
+ B = _mm_aesdec_si128(B, K4);
+ B = _mm_aesdec_si128(B, K5);
+ B = _mm_aesdec_si128(B, K6);
+ B = _mm_aesdec_si128(B, K7);
+ B = _mm_aesdec_si128(B, K8);
+ B = _mm_aesdec_si128(B, K9);
+ B = _mm_aesdec_si128(B, K10);
+ B = _mm_aesdec_si128(B, K11);
+ B = _mm_aesdec_si128(B, K12);
+ B = _mm_aesdec_si128(B, K13);
+ B = _mm_aesdeclast_si128(B, K14);
+
+ _mm_storeu_si128(out_mm + i, B);
+ }
+ }
+
+/*
+* AES-256 Key Schedule
+*/
+BOTAN_FUNC_ISA("ssse3,aes")
+void AES_256::aesni_key_schedule(const uint8_t key[], size_t)
+ {
+ m_EK.resize(60);
+ m_DK.resize(60);
+
+ const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
+ const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
+
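+   // AES-256 alternates two expansion steps: even-numbered round keys use
+   // _mm_aeskeygenassist_si128 with a round constant (via the AES-128
+   // helper), odd-numbered ones come from aes_256_key_expansion, which
+   // corresponds to the SubWord-only step of FIPS-197.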
+ const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
+ const __m128i K3 = aes_256_key_expansion(K1, K2);
+
+ const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
+ const __m128i K5 = aes_256_key_expansion(K3, K4);
+
+ const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
+ const __m128i K7 = aes_256_key_expansion(K5, K6);
+
+ const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
+ const __m128i K9 = aes_256_key_expansion(K7, K8);
+
+ const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
+ const __m128i K11 = aes_256_key_expansion(K9, K10);
+
+ const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
+ const __m128i K13 = aes_256_key_expansion(K11, K12);
+
+ const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
+
+ __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
+ _mm_storeu_si128(EK_mm , K0);
+ _mm_storeu_si128(EK_mm + 1, K1);
+ _mm_storeu_si128(EK_mm + 2, K2);
+ _mm_storeu_si128(EK_mm + 3, K3);
+ _mm_storeu_si128(EK_mm + 4, K4);
+ _mm_storeu_si128(EK_mm + 5, K5);
+ _mm_storeu_si128(EK_mm + 6, K6);
+ _mm_storeu_si128(EK_mm + 7, K7);
+ _mm_storeu_si128(EK_mm + 8, K8);
+ _mm_storeu_si128(EK_mm + 9, K9);
+ _mm_storeu_si128(EK_mm + 10, K10);
+ _mm_storeu_si128(EK_mm + 11, K11);
+ _mm_storeu_si128(EK_mm + 12, K12);
+ _mm_storeu_si128(EK_mm + 13, K13);
+ _mm_storeu_si128(EK_mm + 14, K14);
+
+ // Now generate decryption keys
+ __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
+ _mm_storeu_si128(DK_mm , K14);
+ _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
+ _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
+ _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
+ _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
+ _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
+ _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
+ _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
+ _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
+ _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
+ _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
+ _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
+ _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
+ _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
+ _mm_storeu_si128(DK_mm + 14, K0);
+ }
+
+#undef AES_ENC_4_ROUNDS
+#undef AES_ENC_4_LAST_ROUNDS
+#undef AES_DEC_4_ROUNDS
+#undef AES_DEC_4_LAST_ROUNDS
+
+}
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_ni/info.txt b/comm/third_party/botan/src/lib/block/aes/aes_ni/info.txt
new file mode 100644
index 0000000000..2e9749fb8e
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_ni/info.txt
@@ -0,0 +1,9 @@
+<defines>
+AES_NI -> 20131128
+</defines>
+
+<isa>
+sse2
+ssse3
+aesni
+</isa>
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_power8/aes_power8.cpp b/comm/third_party/botan/src/lib/block/aes/aes_power8/aes_power8.cpp
new file mode 100644
index 0000000000..18bc85933b
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_power8/aes_power8.cpp
@@ -0,0 +1,529 @@
+/*
+* AES using POWER8/POWER9 crypto extensions
+*
+* Contributed by Jeffrey Walton
+*
+* Further changes
+* (C) 2018,2019 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/aes.h>
+#include <botan/cpuid.h>
+
+#include <altivec.h>
+#undef vector
+#undef bool
+
+namespace Botan {
+
+typedef __vector unsigned long long Altivec64x2;
+typedef __vector unsigned int Altivec32x4;
+typedef __vector unsigned char Altivec8x16;
+
+namespace {
+
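+// On little-endian POWER, loads and stores are byte-reversed so that the AES
+// state and round keys are kept in the big-endian lane order that the
+// vcipher/vncipher builtins operate on.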
+inline Altivec8x16 reverse_vec(Altivec8x16 src)
+ {
+ if(CPUID::is_little_endian())
+ {
+ const Altivec8x16 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+ const Altivec8x16 zero = {0};
+ return vec_perm(src, zero, mask);
+ }
+ else
+ {
+ return src;
+ }
+ }
+
+inline Altivec64x2 load_key(const uint32_t key[])
+ {
+   return (Altivec64x2)reverse_vec((Altivec8x16)vec_vsx_ld(0, key));
+ }
+
+inline Altivec64x2 load_block(const uint8_t src[])
+ {
+ return (Altivec64x2)reverse_vec(vec_vsx_ld(0, src));
+ }
+
+inline void store_block(Altivec64x2 src, uint8_t dest[])
+ {
+ vec_vsx_st(reverse_vec((Altivec8x16)src), 0, dest);
+ }
+
+inline void store_blocks(Altivec64x2 B0, Altivec64x2 B1,
+ Altivec64x2 B2, Altivec64x2 B3,
+ uint8_t out[])
+ {
+ store_block(B0, out);
+ store_block(B1, out+16);
+ store_block(B2, out+16*2);
+ store_block(B3, out+16*3);
+ }
+
+#define AES_XOR_4(B0, B1, B2, B3, K) do { \
+ B0 = vec_xor(B0, K); \
+ B1 = vec_xor(B1, K); \
+ B2 = vec_xor(B2, K); \
+ B3 = vec_xor(B3, K); \
+ } while(0)
+
+#define AES_ENCRYPT_4(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vcipher(B0, K); \
+ B1 = __builtin_crypto_vcipher(B1, K); \
+ B2 = __builtin_crypto_vcipher(B2, K); \
+ B3 = __builtin_crypto_vcipher(B3, K); \
+ } while(0)
+
+#define AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vcipherlast(B0, K); \
+ B1 = __builtin_crypto_vcipherlast(B1, K); \
+ B2 = __builtin_crypto_vcipherlast(B2, K); \
+ B3 = __builtin_crypto_vcipherlast(B3, K); \
+ } while(0)
+
+#define AES_DECRYPT_4(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vncipher(B0, K); \
+ B1 = __builtin_crypto_vncipher(B1, K); \
+ B2 = __builtin_crypto_vncipher(B2, K); \
+ B3 = __builtin_crypto_vncipher(B3, K); \
+ } while(0)
+
+#define AES_DECRYPT_4_LAST(B0, B1, B2, B3, K) do { \
+ B0 = __builtin_crypto_vncipherlast(B0, K); \
+ B1 = __builtin_crypto_vncipherlast(B1, K); \
+ B2 = __builtin_crypto_vncipherlast(B2, K); \
+ B3 = __builtin_crypto_vncipherlast(B3, K); \
+ } while(0)
+
+}
+
+BOTAN_FUNC_ISA("crypto")
+void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const Altivec64x2 K0 = load_key(&m_EK[0]);
+ const Altivec64x2 K1 = load_key(&m_EK[4]);
+ const Altivec64x2 K2 = load_key(&m_EK[8]);
+ const Altivec64x2 K3 = load_key(&m_EK[12]);
+ const Altivec64x2 K4 = load_key(&m_EK[16]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[32]);
+ const Altivec64x2 K9 = load_key(&m_EK[36]);
+ const Altivec64x2 K10 = load_key(&m_EK[40]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+ AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ Altivec64x2 B = load_block(in);
+
+ B = vec_xor(B, K0);
+ B = __builtin_crypto_vcipher(B, K1);
+ B = __builtin_crypto_vcipher(B, K2);
+ B = __builtin_crypto_vcipher(B, K3);
+ B = __builtin_crypto_vcipher(B, K4);
+ B = __builtin_crypto_vcipher(B, K5);
+ B = __builtin_crypto_vcipher(B, K6);
+ B = __builtin_crypto_vcipher(B, K7);
+ B = __builtin_crypto_vcipher(B, K8);
+ B = __builtin_crypto_vcipher(B, K9);
+ B = __builtin_crypto_vcipherlast(B, K10);
+
+ store_block(B, out);
+
+ out += 16;
+ in += 16;
+ }
+ }
+
+BOTAN_FUNC_ISA("crypto")
+void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const Altivec64x2 K0 = load_key(&m_EK[40]);
+ const Altivec64x2 K1 = load_key(&m_EK[36]);
+ const Altivec64x2 K2 = load_key(&m_EK[32]);
+ const Altivec64x2 K3 = load_key(&m_EK[28]);
+ const Altivec64x2 K4 = load_key(&m_EK[24]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[16]);
+ const Altivec64x2 K7 = load_key(&m_EK[12]);
+ const Altivec64x2 K8 = load_key(&m_EK[8]);
+ const Altivec64x2 K9 = load_key(&m_EK[4]);
+ const Altivec64x2 K10 = load_key(&m_EK[0]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_DECRYPT_4(B0, B1, B2, B3, K1);
+ AES_DECRYPT_4(B0, B1, B2, B3, K2);
+ AES_DECRYPT_4(B0, B1, B2, B3, K3);
+ AES_DECRYPT_4(B0, B1, B2, B3, K4);
+ AES_DECRYPT_4(B0, B1, B2, B3, K5);
+ AES_DECRYPT_4(B0, B1, B2, B3, K6);
+ AES_DECRYPT_4(B0, B1, B2, B3, K7);
+ AES_DECRYPT_4(B0, B1, B2, B3, K8);
+ AES_DECRYPT_4(B0, B1, B2, B3, K9);
+ AES_DECRYPT_4_LAST(B0, B1, B2, B3, K10);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ Altivec64x2 B = load_block(in);
+
+ B = vec_xor(B, K0);
+ B = __builtin_crypto_vncipher(B, K1);
+ B = __builtin_crypto_vncipher(B, K2);
+ B = __builtin_crypto_vncipher(B, K3);
+ B = __builtin_crypto_vncipher(B, K4);
+ B = __builtin_crypto_vncipher(B, K5);
+ B = __builtin_crypto_vncipher(B, K6);
+ B = __builtin_crypto_vncipher(B, K7);
+ B = __builtin_crypto_vncipher(B, K8);
+ B = __builtin_crypto_vncipher(B, K9);
+ B = __builtin_crypto_vncipherlast(B, K10);
+
+ store_block(B, out);
+
+ out += 16;
+ in += 16;
+ }
+ }
+
+BOTAN_FUNC_ISA("crypto")
+void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const Altivec64x2 K0 = load_key(&m_EK[0]);
+ const Altivec64x2 K1 = load_key(&m_EK[4]);
+ const Altivec64x2 K2 = load_key(&m_EK[8]);
+ const Altivec64x2 K3 = load_key(&m_EK[12]);
+ const Altivec64x2 K4 = load_key(&m_EK[16]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[32]);
+ const Altivec64x2 K9 = load_key(&m_EK[36]);
+ const Altivec64x2 K10 = load_key(&m_EK[40]);
+ const Altivec64x2 K11 = load_key(&m_EK[44]);
+ const Altivec64x2 K12 = load_key(&m_EK[48]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+ AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ Altivec64x2 B = load_block(in);
+
+ B = vec_xor(B, K0);
+ B = __builtin_crypto_vcipher(B, K1);
+ B = __builtin_crypto_vcipher(B, K2);
+ B = __builtin_crypto_vcipher(B, K3);
+ B = __builtin_crypto_vcipher(B, K4);
+ B = __builtin_crypto_vcipher(B, K5);
+ B = __builtin_crypto_vcipher(B, K6);
+ B = __builtin_crypto_vcipher(B, K7);
+ B = __builtin_crypto_vcipher(B, K8);
+ B = __builtin_crypto_vcipher(B, K9);
+ B = __builtin_crypto_vcipher(B, K10);
+ B = __builtin_crypto_vcipher(B, K11);
+ B = __builtin_crypto_vcipherlast(B, K12);
+
+ store_block(B, out);
+
+ out += 16;
+ in += 16;
+ }
+ }
+
+BOTAN_FUNC_ISA("crypto")
+void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const Altivec64x2 K0 = load_key(&m_EK[48]);
+ const Altivec64x2 K1 = load_key(&m_EK[44]);
+ const Altivec64x2 K2 = load_key(&m_EK[40]);
+ const Altivec64x2 K3 = load_key(&m_EK[36]);
+ const Altivec64x2 K4 = load_key(&m_EK[32]);
+ const Altivec64x2 K5 = load_key(&m_EK[28]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[20]);
+ const Altivec64x2 K8 = load_key(&m_EK[16]);
+ const Altivec64x2 K9 = load_key(&m_EK[12]);
+ const Altivec64x2 K10 = load_key(&m_EK[8]);
+ const Altivec64x2 K11 = load_key(&m_EK[4]);
+ const Altivec64x2 K12 = load_key(&m_EK[0]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_DECRYPT_4(B0, B1, B2, B3, K1);
+ AES_DECRYPT_4(B0, B1, B2, B3, K2);
+ AES_DECRYPT_4(B0, B1, B2, B3, K3);
+ AES_DECRYPT_4(B0, B1, B2, B3, K4);
+ AES_DECRYPT_4(B0, B1, B2, B3, K5);
+ AES_DECRYPT_4(B0, B1, B2, B3, K6);
+ AES_DECRYPT_4(B0, B1, B2, B3, K7);
+ AES_DECRYPT_4(B0, B1, B2, B3, K8);
+ AES_DECRYPT_4(B0, B1, B2, B3, K9);
+ AES_DECRYPT_4(B0, B1, B2, B3, K10);
+ AES_DECRYPT_4(B0, B1, B2, B3, K11);
+ AES_DECRYPT_4_LAST(B0, B1, B2, B3, K12);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ Altivec64x2 B = load_block(in);
+
+ B = vec_xor(B, K0);
+ B = __builtin_crypto_vncipher(B, K1);
+ B = __builtin_crypto_vncipher(B, K2);
+ B = __builtin_crypto_vncipher(B, K3);
+ B = __builtin_crypto_vncipher(B, K4);
+ B = __builtin_crypto_vncipher(B, K5);
+ B = __builtin_crypto_vncipher(B, K6);
+ B = __builtin_crypto_vncipher(B, K7);
+ B = __builtin_crypto_vncipher(B, K8);
+ B = __builtin_crypto_vncipher(B, K9);
+ B = __builtin_crypto_vncipher(B, K10);
+ B = __builtin_crypto_vncipher(B, K11);
+ B = __builtin_crypto_vncipherlast(B, K12);
+
+ store_block(B, out);
+
+ out += 16;
+ in += 16;
+ }
+ }
+
+BOTAN_FUNC_ISA("crypto")
+void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const Altivec64x2 K0 = load_key(&m_EK[0]);
+ const Altivec64x2 K1 = load_key(&m_EK[4]);
+ const Altivec64x2 K2 = load_key(&m_EK[8]);
+ const Altivec64x2 K3 = load_key(&m_EK[12]);
+ const Altivec64x2 K4 = load_key(&m_EK[16]);
+ const Altivec64x2 K5 = load_key(&m_EK[20]);
+ const Altivec64x2 K6 = load_key(&m_EK[24]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[32]);
+ const Altivec64x2 K9 = load_key(&m_EK[36]);
+ const Altivec64x2 K10 = load_key(&m_EK[40]);
+ const Altivec64x2 K11 = load_key(&m_EK[44]);
+ const Altivec64x2 K12 = load_key(&m_EK[48]);
+ const Altivec64x2 K13 = load_key(&m_EK[52]);
+ const Altivec64x2 K14 = load_key(&m_EK[56]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K1);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K2);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K3);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K4);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K5);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K6);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K7);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K8);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K9);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K10);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K11);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K12);
+ AES_ENCRYPT_4(B0, B1, B2, B3, K13);
+ AES_ENCRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ Altivec64x2 B = load_block(in);
+
+ B = vec_xor(B, K0);
+ B = __builtin_crypto_vcipher(B, K1);
+ B = __builtin_crypto_vcipher(B, K2);
+ B = __builtin_crypto_vcipher(B, K3);
+ B = __builtin_crypto_vcipher(B, K4);
+ B = __builtin_crypto_vcipher(B, K5);
+ B = __builtin_crypto_vcipher(B, K6);
+ B = __builtin_crypto_vcipher(B, K7);
+ B = __builtin_crypto_vcipher(B, K8);
+ B = __builtin_crypto_vcipher(B, K9);
+ B = __builtin_crypto_vcipher(B, K10);
+ B = __builtin_crypto_vcipher(B, K11);
+ B = __builtin_crypto_vcipher(B, K12);
+ B = __builtin_crypto_vcipher(B, K13);
+ B = __builtin_crypto_vcipherlast(B, K14);
+
+ store_block(B, out);
+
+ out += 16;
+ in += 16;
+ }
+ }
+
+BOTAN_FUNC_ISA("crypto")
+void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const Altivec64x2 K0 = load_key(&m_EK[56]);
+ const Altivec64x2 K1 = load_key(&m_EK[52]);
+ const Altivec64x2 K2 = load_key(&m_EK[48]);
+ const Altivec64x2 K3 = load_key(&m_EK[44]);
+ const Altivec64x2 K4 = load_key(&m_EK[40]);
+ const Altivec64x2 K5 = load_key(&m_EK[36]);
+ const Altivec64x2 K6 = load_key(&m_EK[32]);
+ const Altivec64x2 K7 = load_key(&m_EK[28]);
+ const Altivec64x2 K8 = load_key(&m_EK[24]);
+ const Altivec64x2 K9 = load_key(&m_EK[20]);
+ const Altivec64x2 K10 = load_key(&m_EK[16]);
+ const Altivec64x2 K11 = load_key(&m_EK[12]);
+ const Altivec64x2 K12 = load_key(&m_EK[8]);
+ const Altivec64x2 K13 = load_key(&m_EK[4]);
+ const Altivec64x2 K14 = load_key(&m_EK[0]);
+
+ while(blocks >= 4)
+ {
+ Altivec64x2 B0 = load_block(in);
+ Altivec64x2 B1 = load_block(in+16);
+ Altivec64x2 B2 = load_block(in+16*2);
+ Altivec64x2 B3 = load_block(in+16*3);
+
+ AES_XOR_4(B0, B1, B2, B3, K0);
+ AES_DECRYPT_4(B0, B1, B2, B3, K1);
+ AES_DECRYPT_4(B0, B1, B2, B3, K2);
+ AES_DECRYPT_4(B0, B1, B2, B3, K3);
+ AES_DECRYPT_4(B0, B1, B2, B3, K4);
+ AES_DECRYPT_4(B0, B1, B2, B3, K5);
+ AES_DECRYPT_4(B0, B1, B2, B3, K6);
+ AES_DECRYPT_4(B0, B1, B2, B3, K7);
+ AES_DECRYPT_4(B0, B1, B2, B3, K8);
+ AES_DECRYPT_4(B0, B1, B2, B3, K9);
+ AES_DECRYPT_4(B0, B1, B2, B3, K10);
+ AES_DECRYPT_4(B0, B1, B2, B3, K11);
+ AES_DECRYPT_4(B0, B1, B2, B3, K12);
+ AES_DECRYPT_4(B0, B1, B2, B3, K13);
+ AES_DECRYPT_4_LAST(B0, B1, B2, B3, K14);
+
+ store_blocks(B0, B1, B2, B3, out);
+
+ out += 4*16;
+ in += 4*16;
+ blocks -= 4;
+ }
+
+ for(size_t i = 0; i != blocks; ++i)
+ {
+ Altivec64x2 B = load_block(in);
+
+ B = vec_xor(B, K0);
+ B = __builtin_crypto_vncipher(B, K1);
+ B = __builtin_crypto_vncipher(B, K2);
+ B = __builtin_crypto_vncipher(B, K3);
+ B = __builtin_crypto_vncipher(B, K4);
+ B = __builtin_crypto_vncipher(B, K5);
+ B = __builtin_crypto_vncipher(B, K6);
+ B = __builtin_crypto_vncipher(B, K7);
+ B = __builtin_crypto_vncipher(B, K8);
+ B = __builtin_crypto_vncipher(B, K9);
+ B = __builtin_crypto_vncipher(B, K10);
+ B = __builtin_crypto_vncipher(B, K11);
+ B = __builtin_crypto_vncipher(B, K12);
+ B = __builtin_crypto_vncipher(B, K13);
+ B = __builtin_crypto_vncipherlast(B, K14);
+
+ store_block(B, out);
+
+ out += 16;
+ in += 16;
+ }
+ }
+
+#undef AES_XOR_4
+#undef AES_ENCRYPT_4
+#undef AES_ENCRYPT_4_LAST
+#undef AES_DECRYPT_4
+#undef AES_DECRYPT_4_LAST
+
+}
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_power8/info.txt b/comm/third_party/botan/src/lib/block/aes/aes_power8/info.txt
new file mode 100644
index 0000000000..df569edd50
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_power8/info.txt
@@ -0,0 +1,11 @@
+<defines>
+AES_POWER8 -> 20180223
+</defines>
+
+<arch>
+ppc64
+</arch>
+
+<isa>
+powercrypto
+</isa>
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/comm/third_party/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp
new file mode 100644
index 0000000000..4ae6bb2236
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -0,0 +1,627 @@
+/*
+* AES using vector permutes (SSSE3, NEON)
+* (C) 2010,2016,2019 Jack Lloyd
+*
+* Based on public domain x86-64 assembly written by Mike Hamburg,
+* described in "Accelerating AES with Vector Permute Instructions"
+* (CHES 2009). His original code is available at
+* https://crypto.stanford.edu/vpaes/
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/aes.h>
+#include <botan/internal/ct_utils.h>
+#include <botan/internal/simd_32.h>
+
+#if defined(BOTAN_SIMD_USE_SSE2)
+ #include <tmmintrin.h>
+#endif
+
+namespace Botan {
+
+namespace {
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shuffle(SIMD_4x32 a, SIMD_4x32 b)
+ {
+#if defined(BOTAN_SIMD_USE_SSE2)
+ return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw()));
+#elif defined(BOTAN_SIMD_USE_NEON)
+ const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw());
+ const uint8x16_t idx = vreinterpretq_u8_u32(b.raw());
+
+#if defined(BOTAN_TARGET_ARCH_IS_ARM32)
+ const uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) };
+
+ return SIMD_4x32(vreinterpretq_u32_u8(
+ vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)),
+ vtbl2_u8(tbl2, vget_high_u8(idx)))));
+
+#else
+ return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx)));
+#endif
+
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+
+ const auto zero = vec_splat_s8(0x00);
+ const auto mask = vec_cmplt((__vector signed char)b.raw(), zero);
+ const auto r = vec_perm((__vector signed char)a.raw(), (__vector signed char)a.raw(), (__vector unsigned char)b.raw());
+ return SIMD_4x32((__vector unsigned int)vec_sel(r, zero, mask));
+
+#else
+ #error "No shuffle implementation available"
+#endif
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) alignr8(SIMD_4x32 a, SIMD_4x32 b)
+ {
+#if defined(BOTAN_SIMD_USE_SSE2)
+ return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
+#elif defined(BOTAN_SIMD_USE_NEON)
+ return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
+#elif defined(BOTAN_SIMD_USE_ALTIVEC)
+ const __vector unsigned char mask = {8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
+ return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
+#else
+ #error "No alignr8 implementation available"
+#endif
+ }
+
+const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
+const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);
+
+const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
+const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);
+
+const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
+const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
+const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
+const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
+
+const SIMD_4x32 sboud = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
+const SIMD_4x32 sbotd = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
+
+const SIMD_4x32 mc_forward[4] = {
+ SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
+ SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
+ SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
+ SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)
+};
+
+const SIMD_4x32 vperm_sr[4] = {
+ SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
+ SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
+ SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
+ SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
+};
+
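+// Round constants for the vperm key schedule. These are not the raw
+// FIPS-197 values: as in Hamburg's vpaes code, they appear to be stored
+// already adjusted for the transformed representation used here.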
+const SIMD_4x32 rcon[10] = {
+ SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
+ SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
+};
+
+const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
+const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
+
+const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
+const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);
+
+const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
+const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
+
+const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
+const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);
+
+const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
+const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);
+
+const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
+const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
+
+const SIMD_4x32 mcx[4] = {
+ SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
+ SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
+ SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
+ SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
+};
+
+const SIMD_4x32 mc_backward[4] = {
+ SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
+ SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
+ SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
+ SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
+};
+
+const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
+
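+// The vperm technique operates on 4-bit nibbles: each byte of the state is
+// split into its low and high nibble, which are then used as indices into
+// 16-entry lookup tables via the byte shuffle above.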
+inline SIMD_4x32 low_nibs(SIMD_4x32 x)
+ {
+ return lo_nibs_mask & x;
+ }
+
+inline SIMD_4x32 high_nibs(SIMD_4x32 x)
+ {
+ return (x.shr<4>() & lo_nibs_mask);
+ }
+
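+// One encryption round in the vperm representation: broadly, the k_inv
+// tables handle the field-inversion part of the S-box, while the sb*/mc_*
+// tables fold in the affine output map together with ShiftRows/MixColumns.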
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K)
+ {
+ return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
+ {
+ const SIMD_4x32 Bh = high_nibs(B);
+ SIMD_4x32 Bl = low_nibs(B);
+ const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
+ Bl ^= Bh;
+
+ const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+ const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
+
+ const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K;
+ const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
+
+ return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
+ {
+ const SIMD_4x32 Bh = high_nibs(B);
+ SIMD_4x32 Bl = low_nibs(B);
+ const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
+ Bl ^= Bh;
+
+ const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+ const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
+
+ return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K)
+ {
+ return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
+ {
+ const SIMD_4x32 Bh = high_nibs(B);
+ B = low_nibs(B);
+ const SIMD_4x32 t2 = shuffle(k_inv2, B);
+
+ B ^= Bh;
+
+ const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+ const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
+
+ const SIMD_4x32 mc = mcx[(r-1)%4];
+
+ const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K;
+ const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6);
+ const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6);
+ return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6);
+ }
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
+ {
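+   // pick which of the four ShiftRows permutations (vperm_sr) applies in
+   // the final round, as a function of the total round count r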
+ const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;
+
+ const SIMD_4x32 Bh = high_nibs(B);
+ B = low_nibs(B);
+ const SIMD_4x32 t2 = shuffle(k_inv2, B);
+
+ B ^= Bh;
+
+ const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+ const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
+
+ const SIMD_4x32 x = shuffle(sboud, t5) ^ shuffle(sbotd, t6) ^ K;
+ return shuffle(x, vperm_sr[which_sr]);
+ }
+
+void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
+ vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
+ const SIMD_4x32 K[], size_t rounds)
+ {
+ CT::poison(in, blocks * 16);
+
+ const size_t blocks2 = blocks - (blocks % 2);
+
+ for(size_t i = 0; i != blocks2; i += 2)
+ {
+ SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
+ SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
+
+ B0 = aes_enc_first_round(B0, K[0]);
+ B1 = aes_enc_first_round(B1, K[0]);
+
+ for(size_t r = 1; r != rounds; ++r)
+ {
+ B0 = aes_enc_round(B0, K[r], r);
+ B1 = aes_enc_round(B1, K[r], r);
+ }
+
+ B0 = aes_enc_last_round(B0, K[rounds], rounds);
+ B1 = aes_enc_last_round(B1, K[rounds], rounds);
+
+ B0.store_le(out + i*16);
+ B1.store_le(out + (i+1)*16);
+ }
+
+ for(size_t i = blocks2; i < blocks; ++i)
+ {
+      SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // remaining single block
+
+ B = aes_enc_first_round(B, K[0]);
+
+ for(size_t r = 1; r != rounds; ++r)
+ {
+ B = aes_enc_round(B, K[r], r);
+ }
+
+ B = aes_enc_last_round(B, K[rounds], rounds);
+ B.store_le(out + i*16);
+ }
+
+ CT::unpoison(in, blocks * 16);
+ CT::unpoison(out, blocks * 16);
+ }
+
+void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
+ vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
+ const SIMD_4x32 K[], size_t rounds)
+ {
+ CT::poison(in, blocks * 16);
+
+ const size_t blocks2 = blocks - (blocks % 2);
+
+ for(size_t i = 0; i != blocks2; i += 2)
+ {
+ SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
+ SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
+
+ B0 = aes_dec_first_round(B0, K[0]);
+ B1 = aes_dec_first_round(B1, K[0]);
+
+ for(size_t r = 1; r != rounds; ++r)
+ {
+ B0 = aes_dec_round(B0, K[r], r);
+ B1 = aes_dec_round(B1, K[r], r);
+ }
+
+ B0 = aes_dec_last_round(B0, K[rounds], rounds);
+ B1 = aes_dec_last_round(B1, K[rounds], rounds);
+
+ B0.store_le(out + i*16);
+ B1.store_le(out + (i+1)*16);
+ }
+
+ for(size_t i = blocks2; i < blocks; ++i)
+ {
+      SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // remaining single block
+
+ B = aes_dec_first_round(B, K[0]);
+
+ for(size_t r = 1; r != rounds; ++r)
+ {
+ B = aes_dec_round(B, K[r], r);
+ }
+
+ B = aes_dec_last_round(B, K[rounds], rounds);
+ B.store_le(out + i*16);
+ }
+
+ CT::unpoison(in, blocks * 16);
+ CT::unpoison(out, blocks * 16);
+ }
+
+}
+
+void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const SIMD_4x32 K[11] = {
+ SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
+ SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
+ SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
+ SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]),
+ };
+
+ return vperm_encrypt_blocks(in, out, blocks, K, 10);
+ }
+
+void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const SIMD_4x32 K[11] = {
+ SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
+ SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
+ SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
+ SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]),
+ };
+
+ return vperm_decrypt_blocks(in, out, blocks, K, 10);
+ }
+
+void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const SIMD_4x32 K[13] = {
+ SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
+ SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
+ SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
+ SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
+ SIMD_4x32(&m_EK[4*12]),
+ };
+
+ return vperm_encrypt_blocks(in, out, blocks, K, 12);
+ }
+
+void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const SIMD_4x32 K[13] = {
+ SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
+ SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
+ SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
+ SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
+ SIMD_4x32(&m_DK[4*12]),
+ };
+
+ return vperm_decrypt_blocks(in, out, blocks, K, 12);
+ }
+
+void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const SIMD_4x32 K[15] = {
+ SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
+ SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
+ SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
+ SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
+ SIMD_4x32(&m_EK[4*12]), SIMD_4x32(&m_EK[4*13]), SIMD_4x32(&m_EK[4*14]),
+ };
+
+ return vperm_encrypt_blocks(in, out, blocks, K, 14);
+ }
+
+void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+ {
+ const SIMD_4x32 K[15] = {
+ SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
+ SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
+ SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
+ SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
+ SIMD_4x32(&m_DK[4*12]), SIMD_4x32(&m_DK[4*13]), SIMD_4x32(&m_DK[4*14]),
+ };
+
+ return vperm_decrypt_blocks(in, out, blocks, K, 14);
+ }
+
+namespace {
+
+inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
+ aes_schedule_transform(SIMD_4x32 input,
+ SIMD_4x32 table_1,
+ SIMD_4x32 table_2)
+ {
+ return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input));
+ }
+
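+// "Mangling" stores a freshly computed round key in the shuffled,
+// basis-adjusted form (note the 0x5B constant) that the vperm round
+// functions above consume directly.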
+SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
+ {
+ const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
+
+ SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
+ SIMD_4x32 t2 = t;
+ t = shuffle(t, mc_forward0);
+ t2 = t ^ t2 ^ shuffle(t, mc_forward0);
+ return shuffle(t2, vperm_sr[round_no % 4]);
+ }
+
+SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no)
+ {
+ const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
+
+ const SIMD_4x32 dsk[8] = {
+ SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
+ SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
+ SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
+ SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
+ SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
+ SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
+ SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
+ SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
+ };
+
+ SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
+ SIMD_4x32 output = shuffle(t, mc_forward0);
+
+ t = aes_schedule_transform(t, dsk[2], dsk[3]);
+ output = shuffle(t ^ output, mc_forward0);
+
+ t = aes_schedule_transform(t, dsk[4], dsk[5]);
+ output = shuffle(t ^ output, mc_forward0);
+
+ t = aes_schedule_transform(t, dsk[6], dsk[7]);
+ output = shuffle(t ^ output, mc_forward0);
+
+ return shuffle(output, vperm_sr[round_no % 4]);
+ }
+
+SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
+ {
+ const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
+ const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
+
+ k = shuffle(k, vperm_sr[round_no % 4]);
+ k ^= SIMD_4x32::splat_u8(0x5B);
+ return aes_schedule_transform(k, out_tr1, out_tr2);
+ }
+
+SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last_dec(SIMD_4x32 k)
+ {
+ const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
+ const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
+
+ k ^= SIMD_4x32::splat_u8(0x5B);
+ return aes_schedule_transform(k, deskew1, deskew2);
+ }
+
+SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
+ {
+ SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
+ smeared ^= smeared.shift_elems_left<2>();
+ smeared ^= SIMD_4x32::splat_u8(0x5B);
+
+ const SIMD_4x32 Bh = high_nibs(input1);
+ SIMD_4x32 Bl = low_nibs(input1);
+
+ const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
+
+ Bl ^= Bh;
+
+ SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+ SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
+
+ return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
+ }
+
+SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2)
+ {
+ // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
+ const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
+ return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
+ }
+
+SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y)
+ {
+ const SIMD_4x32 shuffle3332 =
+ SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
+ const SIMD_4x32 shuffle2000 =
+ SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);
+
+ const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
+ y &= zero_top_half;
+ return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
+ }
+
+}
+
+void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t)
+ {
+ m_EK.resize(11*4);
+ m_DK.resize(11*4);
+
+ SIMD_4x32 key = SIMD_4x32::load_le(keyb);
+
+ shuffle(key, vperm_sr[2]).store_le(&m_DK[4*10]);
+
+ key = aes_schedule_transform(key, k_ipt1, k_ipt2);
+ key.store_le(&m_EK[0]);
+
+ for(size_t i = 1; i != 10; ++i)
+ {
+ key = aes_schedule_round(rcon[i-1], key, key);
+
+ aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]);
+
+ aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]);
+ }
+
+ key = aes_schedule_round(rcon[9], key, key);
+ aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]);
+ aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
+ }
+
+void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
+ {
+ m_EK.resize(13*4);
+ m_DK.resize(13*4);
+
+ SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
+ SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);
+
+ shuffle(key1, vperm_sr[0]).store_le(&m_DK[12*4]);
+
+ key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
+ key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
+
+ key1.store_le(&m_EK[0]);
+
+ for(size_t i = 0; i != 4; ++i)
+ {
+ // key2 with 8 high bytes masked off
+ SIMD_4x32 t = key2;
+ key2 = aes_schedule_round(rcon[2*i], key2, key1);
+ const SIMD_4x32 key2t = alignr8(key2, t);
+ aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
+ aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]);
+
+ t = aes_schedule_192_smear(key2, t);
+
+ aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]);
+ aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]);
+
+ key2 = aes_schedule_round(rcon[2*i+1], t, key2);
+
+ if(i == 3)
+ {
+ aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
+ aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]);
+ }
+ else
+ {
+ aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
+ aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]);
+ }
+
+ key1 = key2;
+ key2 = aes_schedule_192_smear(key2, t);
+ }
+ }
+
+void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t)
+ {
+ m_EK.resize(15*4);
+ m_DK.resize(15*4);
+
+ SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
+ SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);
+
+ shuffle(key1, vperm_sr[2]).store_le(&m_DK[4*14]);
+
+ key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
+ key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
+
+ key1.store_le(&m_EK[0]);
+ aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);
+
+ aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]);
+
+ const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
+
+ for(size_t i = 2; i != 14; i += 2)
+ {
+ const SIMD_4x32 k_t = key2;
+ key1 = key2 = aes_schedule_round(rcon[(i/2)-1], key2, key1);
+
+ aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]);
+ aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]);
+
+ key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);
+
+ aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]);
+ aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]);
+ }
+
+ key2 = aes_schedule_round(rcon[6], key2, key1);
+
+ aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]);
+ aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
+ }
+
+}
diff --git a/comm/third_party/botan/src/lib/block/aes/aes_vperm/info.txt b/comm/third_party/botan/src/lib/block/aes/aes_vperm/info.txt
new file mode 100644
index 0000000000..0b7eabaace
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/aes_vperm/info.txt
@@ -0,0 +1,36 @@
+<defines>
+AES_VPERM -> 20190901
+</defines>
+
+endian little
+
+<isa>
+x86_32:sse2
+x86_64:sse2
+x86_32:ssse3
+x86_64:ssse3
+arm32:neon
+arm64:neon
+ppc32:altivec
+ppc64:altivec
+</isa>
+
+<arch>
+x86_32
+x86_64
+arm32
+arm64
+ppc32
+ppc64
+</arch>
+
+<requires>
+simd
+</requires>
+
+<cc>
+gcc
+clang
+msvc:19.10 # VC 2017
+sunstudio
+</cc>
diff --git a/comm/third_party/botan/src/lib/block/aes/info.txt b/comm/third_party/botan/src/lib/block/aes/info.txt
new file mode 100644
index 0000000000..62455cf2c3
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/aes/info.txt
@@ -0,0 +1,3 @@
+<defines>
+AES -> 20131128
+</defines>