5 files changed, 503 insertions, 0 deletions
diff --git a/comm/third_party/botan/src/lib/block/idea/idea.cpp b/comm/third_party/botan/src/lib/block/idea/idea.cpp
new file mode 100644
index 0000000000..f8f5ceb348
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/idea/idea.cpp
@@ -0,0 +1,240 @@
+/*
+* IDEA
+* (C) 1999-2010,2015 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/idea.h>
+#include <botan/loadstor.h>
+#include <botan/cpuid.h>
+#include <botan/internal/ct_utils.h>
+
+namespace Botan {
+
+namespace {
+
+/*
+* Multiplication modulo 65537 of two 16-bit words; an operand of 0 stands for 2^16 (see the r_2 case below)
+*/
+inline uint16_t mul(uint16_t x, uint16_t y)
+   {
+   const uint32_t P = static_cast<uint32_t>(x) * y;
+   const auto P_mask = CT::Mask<uint16_t>(CT::Mask<uint32_t>::is_zero(P)); // set iff x or y is zero
+   // 2^16 == -1 mod 65537, so P == P_lo - P_hi; one is added back when that difference wraps
+   const uint32_t P_hi = P >> 16;
+   const uint32_t P_lo = P & 0xFFFF;
+
+   const uint16_t carry = (P_lo < P_hi);
+   const uint16_t r_1 = static_cast<uint16_t>((P_lo - P_hi) + carry);
+   const uint16_t r_2 = 1 - x - y; // zero-operand case: 2^16 * v == -v == 1 - v (truncated to 16 bits)
+
+   return P_mask.select(r_2, r_1); // mask-based choice instead of a branch: r_2 if P was zero, else r_1
+   }
+
+/*
+* Find multiplicative inverses modulo 65537
+*
+* 65537 is prime; thus Fermat's little theorem tells us that
+* x^65537 == x modulo 65537, which means
+* x^(65537-2) == x^-1 modulo 65537 since
+* x^(65537-2) * x == 1 mod 65537
+*
+* Do the exponentiation with a basic square and multiply: all bits
+* of the exponent (65535) are 1 so we always multiply
+*/
+uint16_t mul_inv(uint16_t x)
+   {
+   uint16_t y = x; // covers the top bit of the 16-bit exponent
+
+   for(size_t i = 0; i != 15; ++i) // one pass per remaining exponent bit
+      {
+      y = mul(y, y); // square
+      y = mul(y, x); // multiply, unconditionally
+      }
+
+   return y;
+   }
+
+/**
+* IDEA is involutional: encryption and decryption both run this routine and differ only in the 52-word key schedule K
+*/
+void idea_op(const uint8_t in[], uint8_t out[], size_t blocks, const uint16_t K[52])
+   {
+   const size_t BLOCK_SIZE = 8; // bytes per block
+   // NOTE(review): poison/unpoison presumably flag these buffers as secret for constant-time checkers - confirm in ct_utils.h
+   CT::poison(in, blocks * 8);
+   CT::poison(out, blocks * 8);
+   CT::poison(K, 52);
+
+   BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
+      {
+      uint16_t X1, X2, X3, X4;
+      load_be(in + BLOCK_SIZE*i, X1, X2, X3, X4); // block i as four 16-bit words
+
+      for(size_t j = 0; j != 8; ++j) // 8 rounds, 6 subkeys each (K[0..47])
+         {
+         X1 = mul(X1, K[6*j+0]);
+         X2 += K[6*j+1];
+         X3 += K[6*j+2];
+         X4 = mul(X4, K[6*j+3]);
+
+         const uint16_t T0 = X3; // kept for the closing XOR into X2
+         X3 = mul(X3 ^ X1, K[6*j+4]);
+
+         const uint16_t T1 = X2; // kept for the closing XOR into X3
+         X2 = mul((X2 ^ X4) + X3, K[6*j+5]);
+         X3 += X2;
+
+         X1 ^= X2;
+         X4 ^= X3;
+         X2 ^= T0;
+         X3 ^= T1;
+         }
+      // output transformation with K[48..51]: X2 takes K[50], X3 takes K[49], and the two are stored in swapped order
+      X1  = mul(X1, K[48]);
+      X2 += K[50];
+      X3 += K[49];
+      X4  = mul(X4, K[51]);
+
+      store_be(out + BLOCK_SIZE*i, X1, X3, X2, X4);
+      }
+
+   CT::unpoison(in, blocks * 8);
+   CT::unpoison(out, blocks * 8);
+   CT::unpoison(K, 52);
+   }
+
+}
+
+size_t IDEA::parallelism() const // 8 when the SSE2 code is compiled in and the CPU has SSE2, otherwise 1
+   {
+#if defined(BOTAN_HAS_IDEA_SSE2)
+   if(CPUID::has_sse2())
+      {
+      return 8; // sse2_idea_op_8 consumes 8 blocks per call
+      }
+#endif
+   // the portable idea_op loop handles one block per iteration
+   return 1;
+   }
+
+std::string IDEA::provider() const // sse2 under the same build/CPU condition that enables the SSE2 path in encrypt_n/decrypt_n, else base
+   {
+#if defined(BOTAN_HAS_IDEA_SSE2)
+   if(CPUID::has_sse2())
+      {
+      return "sse2";
+      }
+#endif
+   // SSE2 not compiled in or not reported by the CPU: only the portable idea_op is used
+   return "base";
+   }
+
+/*
+* IDEA Encryption: full groups of 8 blocks use the SSE2 routine when available, the rest go through idea_op, both with m_EK
+*/
+void IDEA::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+   {
+   verify_key_set(m_EK.empty() == false); // m_EK is populated by key_schedule
+
+#if defined(BOTAN_HAS_IDEA_SSE2)
+   if(CPUID::has_sse2())
+      {
+      while(blocks >= 8)
+         {
+         sse2_idea_op_8(in, out, m_EK.data());
+         in += 8 * BLOCK_SIZE; // step past the 8 blocks just processed
+         out += 8 * BLOCK_SIZE;
+         blocks -= 8;
+         }
+      }
+#endif
+   // whatever is left (every block when SSE2 is unused, possibly none) takes the scalar path
+   idea_op(in, out, blocks, m_EK.data());
+   }
+
+/*
+* IDEA Decryption: same flow as encrypt_n, but driven by the decryption schedule m_DK
+*/
+void IDEA::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
+   {
+   verify_key_set(m_DK.empty() == false); // m_DK is populated by key_schedule
+
+#if defined(BOTAN_HAS_IDEA_SSE2)
+   if(CPUID::has_sse2())
+      {
+      while(blocks >= 8)
+         {
+         sse2_idea_op_8(in, out, m_DK.data());
+         in += 8 * BLOCK_SIZE; // step past the 8 blocks just processed
+         out += 8 * BLOCK_SIZE;
+         blocks -= 8;
+         }
+      }
+#endif
+   // whatever is left (every block when SSE2 is unused, possibly none) takes the scalar path
+   idea_op(in, out, blocks, m_DK.data());
+   }
+
+/*
+* IDEA Key Schedule: expands the 16-byte key into 52 encryption subkeys (m_EK) and derives 52 decryption subkeys (m_DK)
+*/
+void IDEA::key_schedule(const uint8_t key[], size_t) // length argument is unused; 16 key bytes are read
+   {
+   m_EK.resize(52);
+   m_DK.resize(52);
+
+   CT::poison(key, 16);
+   CT::poison(m_EK.data(), 52);
+   CT::poison(m_DK.data(), 52);
+
+   secure_vector<uint64_t> K(2);
+   // the 128-bit key is held as two 64-bit halves, K[0] loaded from the first 8 key bytes
+   K[0] = load_be<uint64_t>(key, 0);
+   K[1] = load_be<uint64_t>(key, 1);
+
+   for(size_t off = 0; off != 48; off += 8) // 6 passes, 8 subkeys per pass
+      {
+      for(size_t i = 0; i != 8; ++i)
+         m_EK[off+i] = static_cast<uint16_t>(K[i/4] >> (48-16*(i % 4))); // i-th 16-bit word of K[0]:K[1], top word first
+      // rotate the 128-bit value K[0]:K[1] left by 25 bits
+      const uint64_t Kx = (K[0] >> 39);
+      const uint64_t Ky = (K[1] >> 39);
+
+      K[0] = (K[0] << 25) | Ky;
+      K[1] = (K[1] << 25) | Kx;
+      }
+
+   for(size_t i = 0; i != 4; ++i) // last 4 subkeys come from K[0] only (i/4 is always 0 here)
+      m_EK[48+i] = static_cast<uint16_t>(K[i/4] >> (48-16*(i % 4)));
+   // decryption subkeys: m_EK groups taken in reverse order, mul_inv for keys used with mul(), negation for added keys
+   m_DK[0] = mul_inv(m_EK[48]);
+   m_DK[1] = -m_EK[49];
+   m_DK[2] = -m_EK[50];
+   m_DK[3] = mul_inv(m_EK[51]);
+
+   for(size_t i = 0; i != 8*6; i += 6)
+      {
+      m_DK[i+4] = m_EK[46-i]; // subkeys at positions 6j+4 and 6j+5 are copied without inversion
+      m_DK[i+5] = m_EK[47-i];
+      m_DK[i+6] = mul_inv(m_EK[42-i]);
+      m_DK[i+7] = -m_EK[44-i]; // the two negated keys are exchanged relative to their m_EK order
+      m_DK[i+8] = -m_EK[43-i];
+      m_DK[i+9] = mul_inv(m_EK[45-i]);
+      }
+   // the last group (48..51) must keep m_EK order like the first group does, so undo the exchange there
+   std::swap(m_DK[49], m_DK[50]);
+
+   CT::unpoison(key, 16);
+   CT::unpoison(m_EK.data(), 52);
+   CT::unpoison(m_DK.data(), 52);
+   }
+
+void IDEA::clear() // discards both key schedules
+   {
+   zap(m_EK); // NOTE(review): zap presumably wipes and empties the vector, making verify_key_set fail afterwards - confirm
+   zap(m_DK);
+   }
+
+}
diff --git a/comm/third_party/botan/src/lib/block/idea/idea.h b/comm/third_party/botan/src/lib/block/idea/idea.h
new file mode 100644
index 0000000000..e5e51606b9
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/idea/idea.h
@@ -0,0 +1,45 @@
+/*
+* IDEA
+* (C) 1999-2007 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#ifndef BOTAN_IDEA_H_
+#define BOTAN_IDEA_H_
+
+#include <botan/block_cipher.h>
+
+BOTAN_FUTURE_INTERNAL_HEADER(idea.h)
+
+namespace Botan {
+
+/**
+* IDEA block cipher (idea.cpp works on 8-byte blocks and reads a 16-byte key, matching Block_Cipher_Fixed_Params<8, 16>)
+*/
+class BOTAN_PUBLIC_API(2,0) IDEA final : public Block_Cipher_Fixed_Params<8, 16>
+   {
+   public:
+      void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; // processes `blocks` blocks using m_EK
+      void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; // processes `blocks` blocks using m_DK
+
+      void clear() override; // drops both key schedules
+
+      std::string provider() const override; // sse2 or base, see idea.cpp
+      std::string name() const override { return "IDEA"; }
+      BlockCipher* clone() const override { return new IDEA; } // fresh default-constructed (unkeyed) instance
+      size_t parallelism() const override; // 8 on the SSE2 path, otherwise 1
+
+   private:
+#if defined(BOTAN_HAS_IDEA_SSE2)
+      void sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) const; // 8 blocks at once, either direction depending on EK
+#endif
+
+      void key_schedule(const uint8_t[], size_t) override; // fills m_EK and m_DK
+
+      secure_vector<uint16_t> m_EK, m_DK; // encryption / decryption subkeys, 52 words each once a key is set
+   };
+
+}
+
+#endif
diff --git a/comm/third_party/botan/src/lib/block/idea/idea_sse2/idea_sse2.cpp b/comm/third_party/botan/src/lib/block/idea/idea_sse2/idea_sse2.cpp
new file mode 100644
index 0000000000..93648cfc7a
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/idea/idea_sse2/idea_sse2.cpp
@@ -0,0 +1,208 @@
+/*
+* IDEA in SSE2
+* (C) 2009 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include <botan/idea.h>
+#include <botan/internal/ct_utils.h>
+#include <emmintrin.h>
+
+namespace Botan {
+
+namespace {
+
+BOTAN_FUNC_ISA("sse2")
+inline __m128i mul(__m128i X, uint16_t K_16) // multiplies each of the eight 16-bit lanes of X by K_16 modulo 65537
+   {
+   const __m128i zeros = _mm_set1_epi16(0);
+   const __m128i ones = _mm_set1_epi16(1);
+
+   const __m128i K = _mm_set1_epi16(K_16); // K_16 broadcast to all lanes
+
+   const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros); // all-ones in every lane where X is 0
+   const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros); // all-ones everywhere or nowhere, as K is a broadcast
+
+   const __m128i mul_lo = _mm_mullo_epi16(X, K); // low 16 bits of each product
+   const __m128i mul_hi = _mm_mulhi_epu16(X, K); // high 16 bits of each unsigned product
+
+   __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
+
+   // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
+   const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo); // saturating, so nonzero only when mul_hi > mul_lo
+   const __m128i cmp = _mm_min_epu8(
+     _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones); // fold the high byte into the low byte, then clamp to 0x0001
+
+   T = _mm_add_epi16(T, cmp); // same lo - hi + carry reduction as the scalar mul in idea.cpp
+
+   /* Selection: if X[i] is zero then assign 1-K
+                 if K is zero then assign 1-X[i]
+
+      Could if() off value of K_16 for the second, but this gives a
+      constant time implementation which is a nice bonus.
+   */
+
+   T = _mm_or_si128(
+      _mm_andnot_si128(X_is_zero, T),
+      _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
+
+   T = _mm_or_si128(
+      _mm_andnot_si128(K_is_zero, T),
+      _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero)); // when both X[i] and K are zero either selection yields 1
+
+   return T;
+   }
+
+/*
+* 4x8 matrix transpose, applied by sse2_idea_op_8 to the four registers it loads from its 64 input bytes
+*
+* FIXME: why do I need the extra set of unpack_epi32 here? Inverse in
+* transpose_out doesn't need it. Something with the shuffle? Removing
+* that extra unpack could easily save 3-4 cycles per block, and would
+* also help a lot with register pressure on 32-bit x86
+*/
+BOTAN_FUNC_ISA("sse2")
+void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) // B0..B3 are rewritten in place
+   {
+   __m128i T0 = _mm_unpackhi_epi32(B0, B1); // first round of 32-bit interleaves
+   __m128i T1 = _mm_unpacklo_epi32(B0, B1);
+   __m128i T2 = _mm_unpackhi_epi32(B2, B3);
+   __m128i T3 = _mm_unpacklo_epi32(B2, B3);
+
+   __m128i T4 = _mm_unpacklo_epi32(T0, T1); // second round: the extra unpack the FIXME above refers to
+   __m128i T5 = _mm_unpackhi_epi32(T0, T1);
+   __m128i T6 = _mm_unpacklo_epi32(T2, T3);
+   __m128i T7 = _mm_unpackhi_epi32(T2, T3);
+
+   T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2)); // permute the 16-bit words of the upper 64 bits
+   T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
+   T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
+   T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
+
+   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2)); // same permutation on the lower 64 bits
+   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
+   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
+   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
+
+   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); // exchange the two middle 32-bit elements
+   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
+   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
+   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
+   // NOTE(review): the caller then treats B0..B3 as the lane-wise counterparts of X1..X4 in idea_op - confirm the layout
+   B0 = _mm_unpacklo_epi64(T0, T2);
+   B1 = _mm_unpackhi_epi64(T0, T2);
+   B2 = _mm_unpacklo_epi64(T1, T3);
+   B3 = _mm_unpackhi_epi64(T1, T3);
+   }
+
+/*
+* 4x8 matrix transpose (reverse), applied by sse2_idea_op_8 just before it stores its 64 output bytes
+*/
+BOTAN_FUNC_ISA("sse2")
+void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) // B0..B3 are rewritten in place
+   {
+   __m128i T0 = _mm_unpacklo_epi64(B0, B1); // pair up the 64-bit halves of the inputs
+   __m128i T1 = _mm_unpacklo_epi64(B2, B3);
+   __m128i T2 = _mm_unpackhi_epi64(B0, B1);
+   __m128i T3 = _mm_unpackhi_epi64(B2, B3);
+
+   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); // exchange the two middle 32-bit elements
+   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
+   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
+   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
+
+   T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); // exchange the two middle 16-bit words of the upper 64 bits
+   T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
+   T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
+   T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
+
+   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); // same exchange on the lower 64 bits
+   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
+   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
+   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
+
+   B0 = _mm_unpacklo_epi32(T0, T1); // single round of 32-bit interleaves (transpose_in needs two)
+   B1 = _mm_unpackhi_epi32(T0, T1);
+   B2 = _mm_unpacklo_epi32(T2, T3);
+   B3 = _mm_unpackhi_epi32(T2, T3);
+   }
+
+}
+
+/*
+* 8 wide IDEA encryption/decryption in SSE2: reads 64 bytes from in, writes 64 bytes to out, direction is set by the schedule EK
+*/
+BOTAN_FUNC_ISA("sse2")
+void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) const
+   {
+   CT::poison(in, 64);
+   CT::poison(out, 64);
+   CT::poison(EK, 52);
+
+   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
+
+   __m128i B0 = _mm_loadu_si128(in_mm + 0); // unaligned loads, 16 bytes (two blocks) per register
+   __m128i B1 = _mm_loadu_si128(in_mm + 1);
+   __m128i B2 = _mm_loadu_si128(in_mm + 2);
+   __m128i B3 = _mm_loadu_si128(in_mm + 3);
+
+   transpose_in(B0, B1, B2, B3);
+
+   // byte swap each 16-bit lane (the scalar idea_op relies on load_be for its word byte order)
+   B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
+   B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
+   B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
+   B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
+
+   for(size_t i = 0; i != 8; ++i) // 8 rounds; statement for statement the round of idea_op with B0..B3 in place of X1..X4
+      {
+      B0 = mul(B0, EK[6*i+0]);
+      B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
+      B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
+      B3 = mul(B3, EK[6*i+3]);
+
+      __m128i T0 = B2; // kept for the closing XOR into B1
+      B2 = _mm_xor_si128(B2, B0);
+      B2 = mul(B2, EK[6*i+4]);
+
+      __m128i T1 = B1; // kept for the closing XOR into B2
+
+      B1 = _mm_xor_si128(B1, B3);
+      B1 = _mm_add_epi16(B1, B2);
+      B1 = mul(B1, EK[6*i+5]);
+
+      B2 = _mm_add_epi16(B2, B1);
+
+      B0 = _mm_xor_si128(B0, B1);
+      B1 = _mm_xor_si128(B1, T0);
+      B3 = _mm_xor_si128(B3, B2);
+      B2 = _mm_xor_si128(B2, T1);
+      }
+   // output transformation with EK[48..51]; as in idea_op, B1 takes EK[50] and B2 takes EK[49]
+   B0 = mul(B0, EK[48]);
+   B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
+   B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
+   B3 = mul(B3, EK[51]);
+
+   // byte swap each 16-bit lane back before storing
+   B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
+   B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
+   B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
+   B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
+
+   transpose_out(B0, B2, B1, B3); // B2 before B1, mirroring store_be(X1, X3, X2, X4) in idea_op
+
+   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
+
+   _mm_storeu_si128(out_mm + 0, B0); // unaligned stores, same B0, B2, B1, B3 order as passed to transpose_out
+   _mm_storeu_si128(out_mm + 1, B2);
+   _mm_storeu_si128(out_mm + 2, B1);
+   _mm_storeu_si128(out_mm + 3, B3);
+
+   CT::unpoison(in, 64);
+   CT::unpoison(out, 64);
+   CT::unpoison(EK, 52);
+   }
+
+}
diff --git a/comm/third_party/botan/src/lib/block/idea/idea_sse2/info.txt b/comm/third_party/botan/src/lib/block/idea/idea_sse2/info.txt
new file mode 100644
index 0000000000..b0ca2d02fa
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/idea/idea_sse2/info.txt
@@ -0,0 +1,7 @@
+<defines>
+IDEA_SSE2 -> 20131128
+</defines>
+
+<isa>
+sse2
+</isa>
diff --git a/comm/third_party/botan/src/lib/block/idea/info.txt b/comm/third_party/botan/src/lib/block/idea/info.txt
new file mode 100644
index 0000000000..bcbdce03f1
--- /dev/null
+++ b/comm/third_party/botan/src/lib/block/idea/info.txt
@@ -0,0 +1,3 @@
+<defines>
+IDEA -> 20131128
+</defines>