summaryrefslogtreecommitdiffstats
path: root/grub-core/lib/libgcrypt-grub/cipher/rijndael.c
diff options
context:
space:
mode:
Diffstat (limited to 'grub-core/lib/libgcrypt-grub/cipher/rijndael.c')
-rw-r--r--grub-core/lib/libgcrypt-grub/cipher/rijndael.c1478
1 files changed, 1478 insertions, 0 deletions
diff --git a/grub-core/lib/libgcrypt-grub/cipher/rijndael.c b/grub-core/lib/libgcrypt-grub/cipher/rijndael.c
new file mode 100644
index 0000000..92e4d2c
--- /dev/null
+++ b/grub-core/lib/libgcrypt-grub/cipher/rijndael.c
@@ -0,0 +1,1478 @@
+/* This file was automatically imported with
+ import_gcry.py. Please don't modify it */
+#include <grub/dl.h>
+GRUB_MOD_LICENSE ("GPLv3+");
+/* Rijndael (AES) for GnuPG
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ * 2008, 2011 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *******************************************************************
+ * The code here is based on the optimized implementation taken from
+ * http://www.esat.kuleuven.ac.be/~rijmen/rijndael/ on Oct 2, 2000,
+ * which carries this notice:
+ *------------------------------------------
+ * rijndael-alg-fst.c v2.3 April '2000
+ *
+ * Optimised ANSI C code
+ *
+ * authors: v1.0: Antoon Bosselaers
+ * v2.0: Vincent Rijmen
+ * v2.3: Paulo Barreto
+ *
+ * This code is placed in the public domain.
+ *------------------------------------------
+ *
+ * The SP800-38a document is available at:
+ * http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
+ *
+ */
+
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+
+#define MAXKC (256/32)
+#define MAXROUNDS 14
+#define BLOCKSIZE (128/8)
+
+
+/* Helper macro to force alignment to 16 bytes. */
+#ifdef __GNUC__
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+
+/* USE_PADLOCK indicates whether to compile the padlock specific
+ code. */
+#undef USE_PADLOCK
+#ifdef ENABLE_PADLOCK_SUPPORT
+# if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__)
+# define USE_PADLOCK 1
+# endif
+#endif /*ENABLE_PADLOCK_SUPPORT*/
+
+/* USE_AESNI indicates whether to compile with Intel AES-NI code. We
+ need the vector-size attribute which seems to be available since
+ gcc 3. However, to be on the safe side we require at least gcc 4. */
+#undef USE_AESNI
+#ifdef ENABLE_AESNI_SUPPORT
+# if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4
+# define USE_AESNI 1
+# endif
+#endif /* ENABLE_AESNI_SUPPORT */
+
+#ifdef USE_AESNI
+ typedef int m128i_t __attribute__ ((__vector_size__ (16)));
+#endif /*USE_AESNI*/
+
+/* Define an u32 variant for the sake of gcc 4.4's strict aliasing. */
+#if __GNUC__ > 4 || ( __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )
+typedef u32 __attribute__ ((__may_alias__)) u32_a_t;
+#else
+typedef u32 u32_a_t;
+#endif
+
+
+
+/* Our context object. It holds the encryption and decryption key
+ schedules, the number of rounds and, where compiled in, the flags
+ selecting the PadLock or AES-NI code paths. */
+typedef struct
+{
+ /* The first fields are the keyschedule arrays. This is so that
+ they are aligned on a 16 byte boundary if using gcc. This
+ alignment is required for the AES-NI code and a good idea in any
+ case. The alignment is guaranteed due to the way cipher.c
+ allocates the space for the context. The PROPERLY_ALIGNED_TYPE
+ hack is used to force a minimal alignment if not using gcc or if
+ the alignment requirement is higher than 16 bytes. */
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte keyschedule[MAXROUNDS+1][4][4]; /* Encryption round keys; accessed as keyschenc. */
+#ifdef USE_PADLOCK
+ /* The key as passed to the padlock engine. It is only used if
+ the padlock engine is used (USE_PADLOCK, below). Being a union
+ member it overlays the first 16 bytes of keyschedule. */
+ unsigned char padlock_key[16] __attribute__ ((aligned (16)));
+#endif /*USE_PADLOCK*/
+ } u1;
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte keyschedule[MAXROUNDS+1][4][4]; /* Decryption round keys; accessed as keyschdec. */
+ } u2;
+ int rounds; /* Key-length-dependent number of rounds. */
+ int decryption_prepared; /* The decryption key schedule is available. */
+#ifdef USE_PADLOCK
+ int use_padlock; /* Padlock shall be used. */
+#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+ int use_aesni; /* AES-NI shall be used. */
+#endif /*USE_AESNI*/
+} RIJNDAEL_context ATTR_ALIGNED_16;
+
+/* Macros defining alias for the keyschedules. */
+#define keyschenc u1.keyschedule
+#define keyschdec u2.keyschedule
+#define padlockkey u1.padlock_key
+
+/* Two macros to be called prior to and after the use of AESNI
+ instructions. There should be no external function calls between
+ the use of these macros. Their purpose is to make sure that the
+ SSE registers are cleared and won't reveal any information about
+ the key or the data. */
+#ifdef USE_AESNI
+# define aesni_prepare() do { } while (0)
+# define aesni_cleanup() \
+ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
+ "pxor %%xmm1, %%xmm1\n" :: ); \
+ } while (0)
+# define aesni_cleanup_2_4() \
+ do { asm volatile ("pxor %%xmm2, %%xmm2\n\t" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n":: ); \
+ } while (0)
+#else
+# define aesni_prepare() do { } while (0)
+# define aesni_cleanup() do { } while (0)
+#endif
+
+
+/* All the numbers. */
+#include "rijndael-tables.h"
+
+
+
+/* Function prototypes. */
+#ifdef USE_AESNI
+/* We don't want to inline these functions to help gcc allocate enough
+ registers. */
+static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr,
+ unsigned char *b, const unsigned char *a)
+ __attribute__ ((__noinline__));
+static void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr,
+ unsigned char *b, const unsigned char *a)
+ __attribute__ ((__noinline__));
+#endif /*USE_AESNI*/
+
+
+
+
+/* Perform the key setup. Expand the KEYLEN byte KEY into the
+ encryption key schedule of CTX and store the matching number of
+ rounds. KEYLEN must be 16, 24 or 32 bytes, otherwise
+ GPG_ERR_INV_KEYLEN is returned. GPG_ERR_SELFTEST_FAILED is
+ returned if the on-the-fly self-test has failed; 0 is returned on
+ success. The decryption key schedule is not computed here; the
+ decryption_prepared flag is merely reset. */
+static gcry_err_code_t
+do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
+{
+ static int initialized = 0;
+ static const char *selftest_failed=0;
+ int rounds;
+ int i,j, r, t, rconpointer = 0;
+ int KC; /* Key length in 32 bit words: 4, 6 or 8. */
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte k[MAXKC][4];
+ } k;
+#define k k.k /* From here on K names the byte array inside the union. */
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte tk[MAXKC][4];
+ } tk;
+#define tk tk.tk /* Likewise TK names the array inside its union. */
+
+ /* The on-the-fly self tests are only run in non-fips mode. In fips
+ mode explicit self-tests are required. Actually the on-the-fly
+ self-tests are not fully thread-safe and it might happen that a
+ failed self-test won't get noticed in another thread.
+
+ FIXME: We might want to have a central registry of succeeded
+ self-tests. */
+ if (!fips_mode () && !initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if (selftest_failed)
+ log_error ("%s\n", selftest_failed );
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ ctx->decryption_prepared = 0; /* Any earlier decryption schedule belongs to the old key. */
+#ifdef USE_PADLOCK
+ ctx->use_padlock = 0;
+#endif
+#ifdef USE_AESNI
+ ctx->use_aesni = 0;
+#endif
+
+ if( keylen == 128/8 )
+ {
+ rounds = 10;
+ KC = 4;
+
+ if (0) /* Dummy head so that each optional branch below can start with "else if". */
+ ;
+#ifdef USE_PADLOCK
+ else if ((_gcry_get_hw_features () & HWF_PADLOCK_AES))
+ {
+ ctx->use_padlock = 1;
+ memcpy (ctx->padlockkey, key, keylen); /* Raw key copy; padlockkey overlays the start of keyschenc. */
+ }
+#endif
+#ifdef USE_AESNI
+ else if ((_gcry_get_hw_features () & HWF_INTEL_AESNI))
+ {
+ ctx->use_aesni = 1;
+ }
+#endif
+ }
+ else if ( keylen == 192/8 )
+ {
+ rounds = 12;
+ KC = 6;
+
+ if (0) /* PadLock is only selected for 128 bit keys above; here only AES-NI is considered. */
+ {
+ ;
+ }
+#ifdef USE_AESNI
+ else if ((_gcry_get_hw_features () & HWF_INTEL_AESNI))
+ {
+ ctx->use_aesni = 1;
+ }
+#endif
+ }
+ else if ( keylen == 256/8 )
+ {
+ rounds = 14;
+ KC = 8;
+
+ if (0)
+ {
+ ;
+ }
+#ifdef USE_AESNI
+ else if ((_gcry_get_hw_features () & HWF_INTEL_AESNI))
+ {
+ ctx->use_aesni = 1;
+ }
+#endif
+ }
+ else
+ return GPG_ERR_INV_KEYLEN;
+
+ ctx->rounds = rounds;
+
+ /* NB: We don't yet support Padlock hardware key generation. */
+
+ if (0)
+ ;
+#ifdef USE_AESNI_is_disabled_here /* This name is never defined in the code shown, so the branch is compiled out. */
+ else if (ctx->use_aesni && ctx->rounds == 10)
+ {
+ /* Note: This code works for AES-128 but it is not much better
+ than using the standard key schedule. We disable it for
+ now and don't put any effort into implementing this for
+ AES-192 and AES-256. */
+ asm volatile ("movl %[key], %%esi\n\t"
+ "movdqu (%%esi), %%xmm1\n\t" /* xmm1 := key */
+ "movl %[ksch], %%esi\n\t"
+ "movdqa %%xmm1, (%%esi)\n\t" /* ksch[0] := xmm1 */
+ "aeskeygenassist $0x01, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x10(%%esi)\n\t" /* ksch[1] := xmm1 */
+ "aeskeygenassist $0x02, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x20(%%esi)\n\t" /* ksch[2] := xmm1 */
+ "aeskeygenassist $0x04, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x30(%%esi)\n\t" /* ksch[3] := xmm1 */
+ "aeskeygenassist $0x08, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x40(%%esi)\n\t" /* ksch[4] := xmm1 */
+ "aeskeygenassist $0x10, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x50(%%esi)\n\t" /* ksch[5] := xmm1 */
+ "aeskeygenassist $0x20, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x60(%%esi)\n\t" /* ksch[6] := xmm1 */
+ "aeskeygenassist $0x40, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x70(%%esi)\n\t" /* ksch[7] := xmm1 */
+ "aeskeygenassist $0x80, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x80(%%esi)\n\t" /* ksch[8] := xmm1 */
+ "aeskeygenassist $0x1b, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0x90(%%esi)\n\t" /* ksch[9] := xmm1 */
+ "aeskeygenassist $0x36, %%xmm1, %%xmm2\n\t"
+ "call .Lexpand128_%=\n\t"
+ "movdqa %%xmm1, 0xa0(%%esi)\n\t" /* ksch[10] := xmm1 */
+ "jmp .Lleave%=\n"
+
+ ".Lexpand128_%=:\n\t"
+ "pshufd $0xff, %%xmm2, %%xmm2\n\t"
+ "movdqa %%xmm1, %%xmm3\n\t"
+ "pslldq $4, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm1\n\t"
+ "pslldq $4, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm1\n\t"
+ "pslldq $4, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm2\n\t"
+ "pxor %%xmm2, %%xmm1\n\t"
+ "ret\n"
+
+ ".Lleave%=:\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n"
+ :
+ : [key] "g" (key), [ksch] "g" (ctx->keyschenc)
+ : "%esi", "cc", "memory" );
+ }
+#endif /*USE_AESNI*/
+ else
+ {
+#define W (ctx->keyschenc) /* Round key array being filled: W[round][word][byte]. */
+ for (i = 0; i < keylen; i++) /* Spread the key bytes over 4 byte words. */
+ {
+ k[i >> 2][i & 3] = key[i];
+ }
+
+ for (j = KC-1; j >= 0; j--) /* TK is the working copy that gets updated in place below. */
+ {
+ *((u32_a_t*)tk[j]) = *((u32_a_t*)k[j]);
+ }
+ r = 0; /* Index of the round key currently being filled. */
+ t = 0; /* Index of the next free word within round key R. */
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
+ }
+ if (t == 4) /* Round key R is complete; continue with the next one. */
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ while (r < rounds + 1)
+ {
+ /* While not enough round key material calculated calculate
+ new values. */
+ tk[0][0] ^= S[tk[KC-1][1]]; /* Last word of TK, rotated by one byte and passed through S (presumably the S-box from rijndael-tables.h). */
+ tk[0][1] ^= S[tk[KC-1][2]];
+ tk[0][2] ^= S[tk[KC-1][3]];
+ tk[0][3] ^= S[tk[KC-1][0]];
+ tk[0][0] ^= rcon[rconpointer++]; /* One entry of the rcon table is consumed per pass. */
+
+ if (KC != 8)
+ {
+ for (j = 1; j < KC; j++) /* Each word is XORed with its just updated predecessor. */
+ {
+ *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
+ }
+ }
+ else
+ {
+ for (j = 1; j < KC/2; j++)
+ {
+ *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
+ }
+ tk[KC/2][0] ^= S[tk[KC/2 - 1][0]]; /* 256 bit keys only: extra S lookup at the middle word, without byte rotation. */
+ tk[KC/2][1] ^= S[tk[KC/2 - 1][1]];
+ tk[KC/2][2] ^= S[tk[KC/2 - 1][2]];
+ tk[KC/2][3] ^= S[tk[KC/2 - 1][3]];
+ for (j = KC/2 + 1; j < KC; j++)
+ {
+ *((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
+ }
+ }
+
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+ }
+#undef W
+ }
+
+ return 0;
+#undef tk
+#undef k
+}
+
+
+/* Key setup entry point taking an untyped CONTEXT pointer.  CONTEXT
+   is used as a RIJNDAEL_context, KEY is the raw key of KEYLEN bytes.
+   The work is done by do_setkey; afterwards _gcry_burn_stack is
+   called (presumably to wipe the stack area do_setkey used) and
+   do_setkey's result is returned.  */
+static gcry_err_code_t
+rijndael_setkey (void *context, const byte *key, const unsigned keylen)
+{
+  RIJNDAEL_context *aes = context;
+  int err;
+
+  err = do_setkey (aes, key, keylen);
+  /* Fixed 100 bytes plus room for 16 ints.  */
+  _gcry_burn_stack (100 + 16 * sizeof (int));
+
+  return err;
+}
+
+
+/* Make a decryption key from an encryption key. Reads the
+ encryption key schedule of CTX (keyschenc) and fills the
+ decryption key schedule (keyschdec). The caller is responsible
+ for setting ctx->decryption_prepared; this function does not
+ touch it. */
+static void
+prepare_decryption( RIJNDAEL_context *ctx )
+{
+ int r;
+
+#ifdef USE_AESNI
+ if (ctx->use_aesni)
+ {
+ /* The AES-NI decrypt instructions use the Equivalent Inverse
+ Cipher, thus we can't use the standard decrypt key
+ preparation. */
+ m128i_t *ekey = (m128i_t*)ctx->keyschenc;
+ m128i_t *dkey = (m128i_t*)ctx->keyschdec;
+ int rr;
+
+ dkey[0] = ekey[ctx->rounds]; /* The round keys are stored in reverse order ... */
+ for (r=1, rr=ctx->rounds-1; r < ctx->rounds; r++, rr--)
+ {
+ asm volatile
+ ("movdqu %[ekey], %%xmm1\n\t"
+ /*"aesimc %%xmm1, %%xmm1\n\t"*/
+ ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" /* ... with the inner ones run through the hand-encoded aesimc. */
+ "movdqu %%xmm1, %[dkey]"
+ : [dkey] "=m" (dkey[r])
+ : [ekey] "m" (ekey[rr]) );
+ }
+ dkey[r] = ekey[0]; /* Here r == ctx->rounds. */
+ }
+ else
+#endif /*USE_AESNI*/
+ {
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte *w;
+ } w;
+#define w w.w /* From here on W (lower case) names the pointer inside the union. */
+
+ for (r=0; r < MAXROUNDS+1; r++ ) /* Copy all MAXROUNDS+1 slots, regardless of ctx->rounds. */
+ {
+ *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]);
+ *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]);
+ *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]);
+ *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]);
+ }
+#define W (ctx->keyschdec)
+ for (r = 1; r < ctx->rounds; r++) /* Round keys 0 and ctx->rounds stay as copied. */
+ {
+ w = W[r][0]; /* Each word is replaced by the XOR of four U table entries (tables presumably from rijndael-tables.h), one per byte. */
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+
+ w = W[r][1];
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+
+ w = W[r][2];
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+
+ w = W[r][3];
+ *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
+ ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+ }
+#undef W
+#undef w
+ }
+}
+
+
+/* Encrypt one block. A and B need to be aligned on a 4 byte
+ boundary. A and B may be the same. The 16 byte block at A is
+ encrypted with the encryption key schedule of CTX using table
+ lookups and the result is written to the 16 bytes at B. B is
+ also used to hold the intermediate state between rounds. */
+static void
+do_encrypt_aligned (const RIJNDAEL_context *ctx,
+ unsigned char *b, const unsigned char *a)
+{
+#define rk (ctx->keyschenc)
+ int rounds = ctx->rounds;
+ int r;
+ union
+ {
+ u32 tempu32[4]; /* Force correct alignment. */
+ byte temp[4][4];
+ } u;
+
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[0][0]); /* Input is fully read into U before B is written, so A may equal B. */
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[0][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[0][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[0][3]);
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]]) /* Each output word combines one byte from each of the four state words. */
+ ^ *((u32_a_t*)T2[u.temp[1][1]])
+ ^ *((u32_a_t*)T3[u.temp[2][2]])
+ ^ *((u32_a_t*)T4[u.temp[3][3]]));
+ *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
+ ^ *((u32_a_t*)T2[u.temp[2][1]])
+ ^ *((u32_a_t*)T3[u.temp[3][2]])
+ ^ *((u32_a_t*)T4[u.temp[0][3]]));
+ *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
+ ^ *((u32_a_t*)T2[u.temp[3][1]])
+ ^ *((u32_a_t*)T3[u.temp[0][2]])
+ ^ *((u32_a_t*)T4[u.temp[1][3]]));
+ *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
+ ^ *((u32_a_t*)T2[u.temp[0][1]])
+ ^ *((u32_a_t*)T3[u.temp[1][2]])
+ ^ *((u32_a_t*)T4[u.temp[2][3]]));
+
+ for (r = 1; r < rounds-1; r++) /* Same step as above, with the state read from B and round key R. */
+ {
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
+
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]])
+ ^ *((u32_a_t*)T2[u.temp[1][1]])
+ ^ *((u32_a_t*)T3[u.temp[2][2]])
+ ^ *((u32_a_t*)T4[u.temp[3][3]]));
+ *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
+ ^ *((u32_a_t*)T2[u.temp[2][1]])
+ ^ *((u32_a_t*)T3[u.temp[3][2]])
+ ^ *((u32_a_t*)T4[u.temp[0][3]]));
+ *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
+ ^ *((u32_a_t*)T2[u.temp[3][1]])
+ ^ *((u32_a_t*)T3[u.temp[0][2]])
+ ^ *((u32_a_t*)T4[u.temp[1][3]]));
+ *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
+ ^ *((u32_a_t*)T2[u.temp[0][1]])
+ ^ *((u32_a_t*)T3[u.temp[1][2]])
+ ^ *((u32_a_t*)T4[u.temp[2][3]]));
+ }
+
+ /* Last round is special. */
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[rounds-1][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[rounds-1][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[rounds-1][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[rounds-1][3]);
+ b[ 0] = T1[u.temp[0][0]][1]; /* Only byte 1 of the T1 entry is used here (presumably the plain S-box value; see rijndael-tables.h). */
+ b[ 1] = T1[u.temp[1][1]][1];
+ b[ 2] = T1[u.temp[2][2]][1];
+ b[ 3] = T1[u.temp[3][3]][1];
+ b[ 4] = T1[u.temp[1][0]][1];
+ b[ 5] = T1[u.temp[2][1]][1];
+ b[ 6] = T1[u.temp[3][2]][1];
+ b[ 7] = T1[u.temp[0][3]][1];
+ b[ 8] = T1[u.temp[2][0]][1];
+ b[ 9] = T1[u.temp[3][1]][1];
+ b[10] = T1[u.temp[0][2]][1];
+ b[11] = T1[u.temp[1][3]][1];
+ b[12] = T1[u.temp[3][0]][1];
+ b[13] = T1[u.temp[0][1]][1];
+ b[14] = T1[u.temp[1][2]][1];
+ b[15] = T1[u.temp[2][3]][1];
+ *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[rounds][0]); /* Finally add the last round key. */
+ *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[rounds][1]);
+ *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]);
+ *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]);
+#undef rk
+}
+
+
+/* Encrypt the 16 byte block AX into BX with the portable C code.
+   If both AX and BX sit on a 16 byte boundary they are handed to
+   do_encrypt_aligned directly; otherwise the block is routed through
+   aligned copies on the stack.  */
+static void
+do_encrypt (const RIJNDAEL_context *ctx,
+            unsigned char *bx, const unsigned char *ax)
+{
+  if (!((size_t)ax & 0x0f) && !((size_t)bx & 0x0f))
+    {
+      /* Both buffers are suitably aligned; work on them in place.  */
+      do_encrypt_aligned (ctx, bx, ax);
+    }
+  else
+    {
+      /* The u32 member forces word alignment even where
+         ATTR_ALIGNED_16 expands to nothing.  */
+      union
+      {
+        u32 dummy[4];
+        byte a[16] ATTR_ALIGNED_16;
+      } in;
+      union
+      {
+        u32 dummy[4];
+        byte b[16] ATTR_ALIGNED_16;
+      } out;
+
+      memcpy (in.a, ax, 16);
+      do_encrypt_aligned (ctx, out.b, in.a);
+      memcpy (bx, out.b, 16);
+    }
+}
+
+
+/* Encrypt or decrypt one block using the padlock engine. A and B may
+ be the same. CTX supplies the raw key (padlockkey) and the number
+ of rounds; a non-zero DECRYPT_FLAG selects decryption. The 16
+ byte block at AX is processed and the result is stored at BX. */
+#ifdef USE_PADLOCK
+static void
+do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag,
+ unsigned char *bx, const unsigned char *ax)
+{
+ /* BX and AX are not necessarily correctly aligned. Thus we need to
+ copy them here. */
+ unsigned char a[16] __attribute__ ((aligned (16)));
+ unsigned char b[16] __attribute__ ((aligned (16)));
+ unsigned int cword[4] __attribute__ ((aligned (16)));
+
+ /* The control word fields are:
+ 127:12 11:10 9 8 7 6 5 4 3:0
+ RESERVED KSIZE CRYPT INTER KEYGN CIPHR ALIGN DGEST ROUND */
+ cword[0] = (ctx->rounds & 15); /* (The mask is just a safeguard.) */
+ cword[1] = 0;
+ cword[2] = 0;
+ cword[3] = 0;
+ if (decrypt_flag)
+ cword[0] |= 0x00000200; /* Bit 9 is the CRYPT field of the layout above. */
+
+ memcpy (a, ax, 16);
+
+ asm volatile
+ ("pushfl\n\t" /* Force key reload. */
+ "popfl\n\t"
+ "xchg %3, %%ebx\n\t" /* Load key. */
+ "movl $1, %%ecx\n\t" /* Init counter for just one block. */
+ ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */
+ "xchg %3, %%ebx\n" /* Restore GOT register. */
+ : /* No output */
+ : "S" (a), "D" (b), "d" (cword), "r" (ctx->padlockkey) /* esi = input, edi = output, edx = control word, %3 = key. */
+ : "%ecx", "cc", "memory"
+ );
+
+ memcpy (bx, b, 16);
+
+}
+#endif /*USE_PADLOCK*/
+
+
+#ifdef USE_AESNI
+/* Encrypt one block using the Intel AES-NI instructions. A and B may
+ be the same; they need to be properly aligned to 16 bytes.
+
+ Our problem here is that gcc does not allow the "x" constraint for
+ SSE registers in asm unless you compile with -msse. The common
+ wisdom is to use a separate file for SSE instructions and build it
+ separately. This would require a lot of extra build system stuff,
+ similar to what we do in mpi/ for the asm stuff. What we do
+ instead is to use standard registers and a bit more of plain asm
+ which copies the data and key stuff to the SSE registers and later
+ back. If we decide to implement some block modes with parallelized
+ AES instructions, it might indeed be better to use plain asm ala
+ mpi/.
+
+ The encryption key schedule and the number of rounds are taken
+ from CTX. xmm0 and xmm1 are used without being listed as
+ clobbered and are not cleared here. */
+static void
+do_aesni_enc_aligned (const RIJNDAEL_context *ctx,
+ unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" /* Hand-encoded instruction; the macro name gives mnemonic and operands. */
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+ /* Note: For now we relax the alignment requirement for A and B: It
+ does not make much difference because in many cases we would need
+ to memcpy them to an extra buffer; using the movdqu is much faster
+ than memcpy and movdqa. For CFB we know that the IV is properly
+ aligned but that is a special case. We should better implement
+ CFB direct in asm. */
+ asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */
+ "movl %[key], %%esi\n\t" /* esi := keyschenc */
+ "movdqa (%%esi), %%xmm1\n\t" /* xmm1 := key[0] */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%%esi), %%xmm1\n\t"
+ "cmp $10, %[rounds]\n\t" /* 10 rounds: key[10] in xmm1 is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%%esi), %%xmm1\n\t"
+ "cmp $12, %[rounds]\n\t" /* 12 rounds: key[12] is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%%esi), %%xmm1\n" /* Otherwise key[14] is used for the last round. */
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "movdqu %%xmm0, %[dst]\n"
+ : [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "%esi", "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+/* Decrypt one block using the Intel AES-NI instructions. The block
+ at A is decrypted with the decryption key schedule of CTX
+ (keyschdec), which must already have been prepared, and the
+ result is stored at B. The structure mirrors
+ do_aesni_enc_aligned; A and B are accessed with unaligned moves. */
+static void
+do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
+ unsigned char *b, const unsigned char *a)
+{
+#define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t" /* Hand-encoded instruction; the macro name gives mnemonic and operands. */
+#define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
+ asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */
+ "movl %[key], %%esi\n\t" /* esi := keyschdec */
+ "movdqa (%%esi), %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x20(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x30(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x40(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x50(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x60(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x70(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x80(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x90(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xa0(%%esi), %%xmm1\n\t"
+ "cmp $10, %[rounds]\n\t" /* 10 rounds: key[10] in xmm1 is the last round key. */
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xb0(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xc0(%%esi), %%xmm1\n\t"
+ "cmp $12, %[rounds]\n\t" /* 12 rounds: key[12] is the last round key. */
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xd0(%%esi), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xe0(%%esi), %%xmm1\n" /* Otherwise key[14] is used for the last round. */
+
+ ".Ldeclast%=:\n\t"
+ aesdeclast_xmm1_xmm0
+ "movdqu %%xmm0, %[dst]\n"
+ : [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "%esi", "cc", "memory");
+#undef aesdec_xmm1_xmm0
+#undef aesdeclast_xmm1_xmm0
+}
+
+
+/* Perform a CFB encryption or decryption round using the
+ initialization vector IV and the input block A. Write the result
+ to the output block B and update IV. IV needs to be 16 byte
+ aligned. In both directions the IV is encrypted with the
+ encryption key schedule of CTX and XORed with A to give the
+ output. The new IV is the output when encrypting and the input
+ block when decrypting. Decryption is selected only by
+ DECRYPT_FLAG == 1; every other value takes the encryption path. */
+static void
+do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag,
+ unsigned char *iv, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" /* Hand-encoded instruction; the macro name gives mnemonic and operands. */
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+ asm volatile ("movdqa %[iv], %%xmm0\n\t" /* xmm0 := IV */
+ "movl %[key], %%esi\n\t" /* esi := keyschenc */
+ "movdqa (%%esi), %%xmm1\n\t" /* xmm1 := key[0] */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%%esi), %%xmm1\n\t"
+ "cmp $10, %[rounds]\n\t" /* 10 rounds: key[10] in xmm1 is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%%esi), %%xmm1\n\t"
+ "cmp $12, %[rounds]\n\t" /* 12 rounds: key[12] is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%%esi), %%xmm1\n" /* Otherwise key[14] is used for the last round. */
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "movdqu %[src], %%xmm1\n\t" /* Save input. */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 = input ^ IV */
+
+ "cmp $1, %[decrypt]\n\t"
+ "jz .Ldecrypt_%=\n\t"
+ "movdqa %%xmm0, %[iv]\n\t" /* [encrypt] Store IV. */
+ "jmp .Lleave_%=\n"
+ ".Ldecrypt_%=:\n\t"
+ "movdqa %%xmm1, %[iv]\n" /* [decrypt] Store IV. */
+ ".Lleave_%=:\n\t"
+ "movdqu %%xmm0, %[dst]\n" /* Store output. */
+ : [iv] "+m" (*iv), [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "g" (ctx->keyschenc),
+ [rounds] "g" (ctx->rounds),
+ [decrypt] "m" (decrypt_flag)
+ : "%esi", "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+/* Perform a CTR encryption round using the counter CTR and the input
+ block A. Write the result to the output block B and update CTR.
+ CTR needs to be a 16 byte aligned little-endian value.
+
+ The current counter value is encrypted with the encryption key
+ schedule of CTX and XORed with A; CTR is replaced by the
+ incremented counter.
+
+ NOTE(review): the code reverses the byte order of the counter
+ with be_mask before and after the increment and its own comment
+ says "big-endian", so the "little-endian" wording above looks
+ wrong -- confirm.
+
+ NOTE(review): paddq adds within separate 64 bit lanes, so an
+ overflow of the low 64 bits of the counter does not appear to
+ carry into the upper 64 bits -- confirm whether callers need a
+ full 128 bit counter. */
+static void
+do_aesni_ctr (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" /* Hand-encoded instruction; the macro name gives mnemonic and operands. */
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+ static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; /* pshufb mask reversing the order of the 16 bytes. */
+
+ asm volatile ("movdqa %[ctr], %%xmm0\n\t" /* xmm0, xmm2 := CTR */
+ "movaps %%xmm0, %%xmm2\n\t"
+ "mov $1, %%esi\n\t" /* xmm2++ (big-endian) */
+ "movd %%esi, %%xmm1\n\t"
+ "pshufb %[mask], %%xmm2\n\t"
+ "paddq %%xmm1, %%xmm2\n\t"
+ "pshufb %[mask], %%xmm2\n\t"
+ "movdqa %%xmm2, %[ctr]\n" /* Update CTR. */
+
+ "movl %[key], %%esi\n\t" /* esi := keyschenc */
+ "movdqa (%%esi), %%xmm1\n\t" /* xmm1 := key[0] */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%%esi), %%xmm1\n\t"
+ "cmp $10, %[rounds]\n\t" /* 10 rounds: key[10] in xmm1 is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%%esi), %%xmm1\n\t"
+ "cmp $12, %[rounds]\n\t" /* 12 rounds: key[12] is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%%esi), %%xmm1\n" /* Otherwise key[14] is used for the last round. */
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
+ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */
+
+ : [ctr] "+m" (*ctr), [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "g" (ctx->keyschenc),
+ [rounds] "g" (ctx->rounds),
+ [mask] "m" (*be_mask)
+ : "%esi", "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+
+/* Four blocks at a time variant of do_aesni_ctr. The counter
+ values CTR, CTR+1, CTR+2 and CTR+3 are encrypted with the
+ encryption key schedule of CTX and XORed with the four
+ consecutive 16 byte blocks starting at A; the results go to the
+ 64 bytes starting at B. CTR is replaced by CTR+4 and is accessed
+ with movdqa.
+
+ NOTE(review): as in do_aesni_ctr the increments use paddq, which
+ adds within separate 64 bit lanes, so an overflow of the low 64
+ bits does not appear to carry into the upper 64 bits -- confirm. */
+static void
+do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" /* Hand-encoded instructions; each macro name gives mnemonic and operands. */
+#define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
+#define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
+#define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+#define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t"
+#define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
+#define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
+
+ static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; /* pshufb mask reversing the order of the 16 bytes. */
+
+ /* Register usage:
+ esi keyschedule
+ xmm0 CTR-0
+ xmm1 temp / round key
+ xmm2 CTR-1
+ xmm3 CTR-2
+ xmm4 CTR-3
+ xmm5 temp
+ */
+
+ asm volatile ("movdqa %[ctr], %%xmm0\n\t" /* xmm0, xmm2 := CTR */
+ "movaps %%xmm0, %%xmm2\n\t"
+ "mov $1, %%esi\n\t" /* xmm1 := 1 */
+ "movd %%esi, %%xmm1\n\t"
+ "pshufb %[mask], %%xmm2\n\t" /* xmm2 := le(xmm2) */
+ "paddq %%xmm1, %%xmm2\n\t" /* xmm2++ */
+ "movaps %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
+ "paddq %%xmm1, %%xmm3\n\t" /* xmm3++ */
+ "movaps %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
+ "paddq %%xmm1, %%xmm4\n\t" /* xmm4++ */
+ "movaps %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
+ "paddq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+ "pshufb %[mask], %%xmm2\n\t" /* xmm2 := be(xmm2) */
+ "pshufb %[mask], %%xmm3\n\t" /* xmm3 := be(xmm3) */
+ "pshufb %[mask], %%xmm4\n\t" /* xmm4 := be(xmm4) */
+ "pshufb %[mask], %%xmm5\n\t" /* xmm5 := be(xmm5) */
+ "movdqa %%xmm5, %[ctr]\n" /* Update CTR. */
+
+ "movl %[key], %%esi\n\t" /* esi := keyschenc */
+ "movdqa (%%esi), %%xmm1\n\t" /* xmm1 := key[0] */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x20(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x30(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x40(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x50(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x60(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x70(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x80(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x90(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xa0(%%esi), %%xmm1\n\t"
+ "cmp $10, %[rounds]\n\t" /* 10 rounds: key[10] in xmm1 is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xb0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xc0(%%esi), %%xmm1\n\t"
+ "cmp $12, %[rounds]\n\t" /* 12 rounds: key[12] is the last round key. */
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xd0(%%esi), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xe0(%%esi), %%xmm1\n" /* Otherwise key[14] is used for the last round. */
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ aesenclast_xmm1_xmm2
+ aesenclast_xmm1_xmm3
+ aesenclast_xmm1_xmm4
+
+ "movdqu %[src], %%xmm1\n\t" /* Get block 1. */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */
+ "movdqu %%xmm0, %[dst]\n\t" /* Store block 1 */
+
+ "movdqu (16)%[src], %%xmm1\n\t" /* Get block 2. */
+ "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */
+ "movdqu %%xmm2, (16)%[dst]\n\t" /* Store block 2. */
+
+ "movdqu (32)%[src], %%xmm1\n\t" /* Get block 3. */
+ "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */
+ "movdqu %%xmm3, (32)%[dst]\n\t" /* Store block 3. */
+
+ "movdqu (48)%[src], %%xmm1\n\t" /* Get block 4. */
+ "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */
+ "movdqu %%xmm4, (48)%[dst]" /* Store block 4. */
+
+ : [ctr] "+m" (*ctr), [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [key] "g" (ctx->keyschenc),
+ [rounds] "g" (ctx->rounds),
+ [mask] "m" (*be_mask)
+ : "%esi", "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenc_xmm1_xmm2
+#undef aesenc_xmm1_xmm3
+#undef aesenc_xmm1_xmm4
+#undef aesenclast_xmm1_xmm0
+#undef aesenclast_xmm1_xmm2
+#undef aesenclast_xmm1_xmm3
+#undef aesenclast_xmm1_xmm4
+}
+
+
+/* Process one block through the AES-NI code path.  AX is the input
+   block and BX receives the result.  A zero DECRYPT_FLAG encrypts;
+   any other value decrypts, running prepare_decryption on CTX first
+   if CTX has not been marked as prepared yet.  */
+static void
+do_aesni (RIJNDAEL_context *ctx, int decrypt_flag,
+          unsigned char *bx, const unsigned char *ax)
+{
+  if (!decrypt_flag)
+    {
+      do_aesni_enc_aligned (ctx, bx, ax);
+      return;
+    }
+
+  /* Decryption setup is done lazily, once per context.  */
+  if (!ctx->decryption_prepared)
+    {
+      prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  do_aesni_dec_aligned (ctx, bx, ax);
+}
+#endif /*USE_AESNI*/
+
+
+/* Encrypt the single block A into B using the key material in CONTEXT
+   (a RIJNDAEL_context).  The first backend that is both compiled in
+   and enabled in the context handles the block: PadLock, then
+   AES-NI, and finally the generic C implementation.  */
+static void
+rijndael_encrypt (void *context, byte *b, const byte *a)
+{
+  RIJNDAEL_context *ctx = context;
+
+#ifdef USE_PADLOCK
+  if (ctx->use_padlock)
+    {
+      do_padlock (ctx, 0, b, a);
+      _gcry_burn_stack (48 + 15 /* possible padding for alignment */);
+      return;
+    }
+#endif /*USE_PADLOCK*/
+
+#ifdef USE_AESNI
+  if (ctx->use_aesni)
+    {
+      aesni_prepare ();
+      do_aesni (ctx, 0, b, a);
+      aesni_cleanup ();
+      return;
+    }
+#endif /*USE_AESNI*/
+
+  /* Generic software fallback.  */
+  do_encrypt (ctx, b, a);
+  _gcry_burn_stack (56 + 2*sizeof(int));
+}
+
+
+/* Bulk encryption of complete blocks in CFB mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
+
+
+/* Bulk encryption of complete blocks in CBC mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
+
+
+/* Bulk encryption of complete blocks in CTR mode. Caller needs to
+ make sure that CTR is aligned on a 16 byte boundary if AESNI; the
+ minimum alignment is for an u32. This function is only intended
+ for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size BLOCKSIZE. */
+
+
+
+/* Decrypt one block. A and B need to be aligned on a 4 byte boundary
+ and the decryption must have been prepared. A and B may be the
+ same.
+
+ CTX supplies the round count (ctx->rounds) and the decryption key
+ schedule (ctx->keyschdec). A is the 16 byte input block and B
+ receives the 16 byte result. All of A is read into a local
+ temporary before the first store to B, which is what allows A and
+ B to be the same buffer. T5..T8 and S5 are lookup tables defined
+ outside this function (presumably the inverse cipher tables --
+ confirm at their definition). */
+static void
+do_decrypt_aligned (RIJNDAEL_context *ctx,
+ unsigned char *b, const unsigned char *a)
+{
+#define rk (ctx->keyschdec) /* Local shorthand; undefined again at the end. */
+ int rounds = ctx->rounds;
+ int r;
+ union
+ {
+ u32 tempu32[4]; /* Force correct alignment. */
+ byte temp[4][4];
+ } u;
+
+ /* Initial step: XOR the input block with round key rk[rounds]. */
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[rounds][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[rounds][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[rounds][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[rounds][3]);
+ /* Each output word combines one byte from each temp word via T5..T8. */
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]])
+ ^ *((u32_a_t*)T6[u.temp[3][1]])
+ ^ *((u32_a_t*)T7[u.temp[2][2]])
+ ^ *((u32_a_t*)T8[u.temp[1][3]]));
+ *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
+ ^ *((u32_a_t*)T6[u.temp[0][1]])
+ ^ *((u32_a_t*)T7[u.temp[3][2]])
+ ^ *((u32_a_t*)T8[u.temp[2][3]]));
+ *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
+ ^ *((u32_a_t*)T6[u.temp[1][1]])
+ ^ *((u32_a_t*)T7[u.temp[0][2]])
+ ^ *((u32_a_t*)T8[u.temp[3][3]]));
+ *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
+ ^ *((u32_a_t*)T6[u.temp[2][1]])
+ ^ *((u32_a_t*)T7[u.temp[1][2]])
+ ^ *((u32_a_t*)T8[u.temp[0][3]]));
+ /* Same key XOR plus table step for round keys rk[rounds-1] down to rk[2]. */
+ for (r = rounds-1; r > 1; r--)
+ {
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
+ *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]])
+ ^ *((u32_a_t*)T6[u.temp[3][1]])
+ ^ *((u32_a_t*)T7[u.temp[2][2]])
+ ^ *((u32_a_t*)T8[u.temp[1][3]]));
+ *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
+ ^ *((u32_a_t*)T6[u.temp[0][1]])
+ ^ *((u32_a_t*)T7[u.temp[3][2]])
+ ^ *((u32_a_t*)T8[u.temp[2][3]]));
+ *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
+ ^ *((u32_a_t*)T6[u.temp[1][1]])
+ ^ *((u32_a_t*)T7[u.temp[0][2]])
+ ^ *((u32_a_t*)T8[u.temp[3][3]]));
+ *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
+ ^ *((u32_a_t*)T6[u.temp[2][1]])
+ ^ *((u32_a_t*)T7[u.temp[1][2]])
+ ^ *((u32_a_t*)T8[u.temp[0][3]]));
+ }
+
+ /* Last round is special: after the XOR with rk[1] the bytes are
+ substituted one at a time through S5 (same byte selection pattern
+ as the table rounds above), and the result is XORed with rk[0]. */
+ *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[1][0]);
+ *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[1][1]);
+ *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[1][2]);
+ *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[1][3]);
+ b[ 0] = S5[u.temp[0][0]];
+ b[ 1] = S5[u.temp[3][1]];
+ b[ 2] = S5[u.temp[2][2]];
+ b[ 3] = S5[u.temp[1][3]];
+ b[ 4] = S5[u.temp[1][0]];
+ b[ 5] = S5[u.temp[0][1]];
+ b[ 6] = S5[u.temp[3][2]];
+ b[ 7] = S5[u.temp[2][3]];
+ b[ 8] = S5[u.temp[2][0]];
+ b[ 9] = S5[u.temp[1][1]];
+ b[10] = S5[u.temp[0][2]];
+ b[11] = S5[u.temp[3][3]];
+ b[12] = S5[u.temp[3][0]];
+ b[13] = S5[u.temp[2][1]];
+ b[14] = S5[u.temp[1][2]];
+ b[15] = S5[u.temp[0][3]];
+ *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[0][0]);
+ *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[0][1]);
+ *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]);
+ *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]);
+#undef rk
+}
+
+
+/* Decrypt the 16 byte block at AX into BX; both may name the same
+   buffer.  prepare_decryption is run on CTX the first time this is
+   called for a context that has not been marked as prepared.  */
+static void
+do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax)
+{
+  if (!ctx->decryption_prepared)
+    {
+      prepare_decryption (ctx);
+      _gcry_burn_stack (64);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* AX and BX are not necessarily correctly aligned.  When both sit
+     on a 16 byte boundary the block is processed in place; otherwise
+     it is bounced through aligned temporaries.  */
+  if (!(((size_t)ax | (size_t)bx) & 0x0f))
+    {
+      do_decrypt_aligned (ctx, bx, ax);
+    }
+  else
+    {
+      union
+      {
+        u32 dummy[4];
+        byte buf[16] ATTR_ALIGNED_16;
+      } in;
+      union
+      {
+        u32 dummy[4];
+        byte buf[16] ATTR_ALIGNED_16;
+      } out;
+
+      memcpy (in.buf, ax, 16);
+      do_decrypt_aligned (ctx, out.buf, in.buf);
+      memcpy (bx, out.buf, 16);
+    }
+}
+
+
+
+
+/* Decrypt the single block A into B using the key material in CONTEXT
+   (a RIJNDAEL_context).  The first backend that is both compiled in
+   and enabled in the context handles the block: PadLock, then
+   AES-NI, and finally the generic C implementation.  */
+static void
+rijndael_decrypt (void *context, byte *b, const byte *a)
+{
+  RIJNDAEL_context *ctx = context;
+
+#ifdef USE_PADLOCK
+  if (ctx->use_padlock)
+    {
+      do_padlock (ctx, 1, b, a);
+      _gcry_burn_stack (48 + 2*sizeof(int) /* FIXME */);
+      return;
+    }
+#endif /*USE_PADLOCK*/
+
+#ifdef USE_AESNI
+  if (ctx->use_aesni)
+    {
+      aesni_prepare ();
+      do_aesni (ctx, 1, b, a);
+      aesni_cleanup ();
+      return;
+    }
+#endif /*USE_AESNI*/
+
+  /* Generic software fallback.  */
+  do_decrypt (ctx, b, a);
+  _gcry_burn_stack (56+2*sizeof(int));
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
+
+
+/* Bulk decryption of complete blocks in CBC mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
+
+
+
+
+/* Run the self-tests for AES 128. Returns NULL on success. */
+
+/* Run the self-tests for AES 192. Returns NULL on success. */
+
+
+/* Run the self-tests for AES 256. Returns NULL on success. */
+
+/* Run all the self-tests and return NULL on success. This function
+ is used for the on-the-fly self-tests. */
+
+
+/* SP800-38a.pdf for AES-128. */
+
+
+/* Complete selftest for AES-128 with all modes and driver code. */
+
+/* Complete selftest for AES-192. */
+
+
+/* Complete selftest for AES-256. */
+
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+
+
+
+
+static const char *rijndael_names[] = /* NULL-terminated name list that the
+ _gcry_cipher_spec_aes initializer below passes as its second
+ member (presumably alternative lookup names -- confirm against
+ the gcry_cipher_spec_t declaration). */
+ {
+ "RIJNDAEL",
+ "AES128",
+ "AES-128",
+ NULL
+ };
+/* OID strings, each paired with a GCRY_CIPHER_MODE_* constant; the
+ table ends with a { NULL } entry. */
+static gcry_cipher_oid_spec_t rijndael_oids[] =
+ {
+ { "2.16.840.1.101.3.4.1.1", GCRY_CIPHER_MODE_ECB },
+ { "2.16.840.1.101.3.4.1.2", GCRY_CIPHER_MODE_CBC },
+ { "2.16.840.1.101.3.4.1.3", GCRY_CIPHER_MODE_OFB },
+ { "2.16.840.1.101.3.4.1.4", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+/* Cipher descriptor named "AES". The positional members are presumably
+ name, name list, OID table, block size in bytes (16), key length
+ in bits (128), context size, and the setkey/encrypt/decrypt
+ handlers -- confirm against the gcry_cipher_spec_t declaration. */
+gcry_cipher_spec_t _gcry_cipher_spec_aes =
+ {
+ "AES", rijndael_names, rijndael_oids, 16, 128, sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt
+ ,
+#ifdef GRUB_UTIL /* The module name is only recorded in GRUB_UTIL builds. */
+ .modname = "gcry_rijndael",
+#endif
+ };
+
+static const char *rijndael192_names[] = /* NULL-terminated name list that
+ the _gcry_cipher_spec_aes192 initializer below passes as its
+ second member (presumably alternative lookup names -- confirm
+ against the gcry_cipher_spec_t declaration). */
+ {
+ "RIJNDAEL192",
+ "AES-192",
+ NULL
+ };
+/* OID strings, each paired with a GCRY_CIPHER_MODE_* constant; the
+ table ends with a { NULL } entry. */
+static gcry_cipher_oid_spec_t rijndael192_oids[] =
+ {
+ { "2.16.840.1.101.3.4.1.21", GCRY_CIPHER_MODE_ECB },
+ { "2.16.840.1.101.3.4.1.22", GCRY_CIPHER_MODE_CBC },
+ { "2.16.840.1.101.3.4.1.23", GCRY_CIPHER_MODE_OFB },
+ { "2.16.840.1.101.3.4.1.24", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+/* Cipher descriptor named "AES192". The positional members are
+ presumably name, name list, OID table, block size in bytes (16),
+ key length in bits (192), context size, and the
+ setkey/encrypt/decrypt handlers -- confirm against the
+ gcry_cipher_spec_t declaration. */
+gcry_cipher_spec_t _gcry_cipher_spec_aes192 =
+ {
+ "AES192", rijndael192_names, rijndael192_oids, 16, 192, sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt
+ ,
+#ifdef GRUB_UTIL /* The module name is only recorded in GRUB_UTIL builds. */
+ .modname = "gcry_rijndael",
+#endif
+ };
+
+static const char *rijndael256_names[] = /* NULL-terminated name list that
+ the _gcry_cipher_spec_aes256 initializer below passes as its
+ second member (presumably alternative lookup names -- confirm
+ against the gcry_cipher_spec_t declaration). */
+ {
+ "RIJNDAEL256",
+ "AES-256",
+ NULL
+ };
+/* OID strings, each paired with a GCRY_CIPHER_MODE_* constant; the
+ table ends with a { NULL } entry. */
+static gcry_cipher_oid_spec_t rijndael256_oids[] =
+ {
+ { "2.16.840.1.101.3.4.1.41", GCRY_CIPHER_MODE_ECB },
+ { "2.16.840.1.101.3.4.1.42", GCRY_CIPHER_MODE_CBC },
+ { "2.16.840.1.101.3.4.1.43", GCRY_CIPHER_MODE_OFB },
+ { "2.16.840.1.101.3.4.1.44", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+/* Cipher descriptor named "AES256". The positional members are
+ presumably name, name list, OID table, block size in bytes (16),
+ key length in bits (256), context size, and the
+ setkey/encrypt/decrypt handlers -- confirm against the
+ gcry_cipher_spec_t declaration. */
+gcry_cipher_spec_t _gcry_cipher_spec_aes256 =
+ {
+ "AES256", rijndael256_names, rijndael256_oids, 16, 256,
+ sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt
+ ,
+#ifdef GRUB_UTIL /* The module name is only recorded in GRUB_UTIL builds. */
+ .modname = "gcry_rijndael",
+#endif
+ };
+
+
+
+/* Module init hook: register the three AES cipher descriptors, in the
+   order 128, 192, 256 bit.  */
+GRUB_MOD_INIT(gcry_rijndael)
+{
+  gcry_cipher_spec_t *specs[] =
+    {
+      &_gcry_cipher_spec_aes,
+      &_gcry_cipher_spec_aes192,
+      &_gcry_cipher_spec_aes256
+    };
+  unsigned int i;
+
+  for (i = 0; i < sizeof (specs) / sizeof (specs[0]); i++)
+    grub_cipher_register (specs[i]);
+}
+
+/* Module fini hook: unregister the three AES cipher descriptors, in the
+   order 128, 192, 256 bit.  */
+GRUB_MOD_FINI(gcry_rijndael)
+{
+  gcry_cipher_spec_t *specs[] =
+    {
+      &_gcry_cipher_spec_aes,
+      &_gcry_cipher_spec_aes192,
+      &_gcry_cipher_spec_aes256
+    };
+  unsigned int i;
+
+  for (i = 0; i < sizeof (specs) / sizeof (specs[0]); i++)
+    grub_cipher_unregister (specs[i]);
+}