Diffstat (limited to 'debian/patches/libiberty-sha1-1.diff')
-rw-r--r-- | debian/patches/libiberty-sha1-1.diff | 464 |
1 files changed, 464 insertions, 0 deletions
diff --git a/debian/patches/libiberty-sha1-1.diff b/debian/patches/libiberty-sha1-1.diff
new file mode 100644
index 0000000..dbdb710
--- /dev/null
+++ b/debian/patches/libiberty-sha1-1.diff
@@ -0,0 +1,464 @@
+# DP: libiberty: Use x86 HW optimized sha1
+
+--- a/src/include/sha1.h
++++ b/src/include/sha1.h
+@@ -108,6 +108,13 @@ extern void sha1_process_block (const vo
+ extern void sha1_process_bytes (const void *buffer, size_t len,
+                                 struct sha1_ctx *ctx);
+ 
++typedef void (*sha1_process_bytes_fn) (const void *, size_t,
++                                       struct sha1_ctx *);
++
++/* Return sha1_process_bytes or some hardware optimized version thereof
++   depending on current CPU.  */
++extern sha1_process_bytes_fn sha1_choose_process_bytes (void);
++
+ /* Process the remaining bytes in the buffer and put result from CTX
+    in first 20 bytes following RESBUF.  The result is always in little
+    endian byte order, so that a byte-wise output yields to the wanted
+--- a/src/libiberty/config.in
++++ b/src/libiberty/config.in
+@@ -432,6 +432,9 @@
+ /* Define to 1 if `vfork' works. */
+ #undef HAVE_WORKING_VFORK
+ 
++/* Define if you have x86 SHA1 HW acceleration support. */
++#undef HAVE_X86_SHA1_HW_SUPPORT
++
+ /* Define to 1 if you have the `_doprnt' function. */
+ #undef HAVE__DOPRNT
+ 
+--- a/src/libiberty/configure
++++ b/src/libiberty/configure
+@@ -7406,6 +7406,64 @@ case "${host}" in
+ esac
+ 
+ 
++{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SHA1 HW acceleration support" >&5
++$as_echo_n "checking for SHA1 HW acceleration support... " >&6; }
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++
++#include <x86intrin.h>
++#include <cpuid.h>
++
++__attribute__((__target__ ("sse4.1,sha")))
++void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
++{
++  __m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
++  __m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
++  abcd = _mm_shuffle_epi32 (abcd, 0x1b);
++  const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
++  abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
++  e0 = _mm_sha1nexte_epu32 (e0, msg1);
++  abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
++  msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
++  msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
++  msg0 = _mm_xor_si128 (msg0, msg1);
++  e0 = _mm_add_epi32 (e0, msg0);
++  e0 = abcd;
++  _mm_storeu_si128 (buf, abcd);
++  e = _mm_extract_epi32 (e0, 3);
++}
++
++int bar (void)
++{
++  unsigned int eax, ebx, ecx, edx;
++  if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
++      && (ebx & bit_SHA) != 0
++      && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
++      && (ecx & bit_SSE4_1) != 0)
++    return 1;
++  return 0;
++}
++
++int
++main ()
++{
++bar ();
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"; then :
++  { $as_echo "$as_me:${as_lineno-$LINENO}: result: x86 SHA1" >&5
++$as_echo "x86 SHA1" >&6; }
++
++$as_echo "#define HAVE_X86_SHA1_HW_SUPPORT 1" >>confdefs.h
++
++else
++  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
++$as_echo "no" >&6; }
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
++
+ 
+ 
+ 
+--- a/src/libiberty/configure.ac
++++ b/src/libiberty/configure.ac
+@@ -728,6 +728,46 @@ case "${host}" in
+ esac
+ AC_SUBST(pexecute)
+ 
++AC_MSG_CHECKING([for SHA1 HW acceleration support])
++AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
++#include <x86intrin.h>
++#include <cpuid.h>
++
++__attribute__((__target__ ("sse4.1,sha")))
++void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
++{
++  __m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
++  __m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
++  abcd = _mm_shuffle_epi32 (abcd, 0x1b);
++  const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
++  abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
++  e0 = _mm_sha1nexte_epu32 (e0, msg1);
++  abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
++  msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
++  msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
++  msg0 = _mm_xor_si128 (msg0, msg1);
++  e0 = _mm_add_epi32 (e0, msg0);
++  e0 = abcd;
++  _mm_storeu_si128 (buf, abcd);
++  e = _mm_extract_epi32 (e0, 3);
++}
++
++int bar (void)
++{
++  unsigned int eax, ebx, ecx, edx;
++  if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
++      && (ebx & bit_SHA) != 0
++      && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
++      && (ecx & bit_SSE4_1) != 0)
++    return 1;
++  return 0;
++}
++]], [[bar ();]])],
++  [AC_MSG_RESULT([x86 SHA1])
++   AC_DEFINE(HAVE_X86_SHA1_HW_SUPPORT, 1,
++             [Define if you have x86 SHA1 HW acceleration support.])],
++  [AC_MSG_RESULT([no])])
++
+ libiberty_AC_FUNC_STRNCMP
+ 
+ # Install a library built with a cross compiler in $(tooldir) rather
+--- a/src/libiberty/sha1.c
++++ b/src/libiberty/sha1.c
+@@ -29,6 +29,11 @@
+ #include <stddef.h>
+ #include <string.h>
+ 
++#ifdef HAVE_X86_SHA1_HW_SUPPORT
++# include <x86intrin.h>
++# include <cpuid.h>
++#endif
++
+ #if USE_UNLOCKED_IO
+ # include "unlocked-io.h"
+ #endif
+@@ -412,3 +417,303 @@ sha1_process_block (const void *buffer,
+       e = ctx->E += e;
+     }
+ }
++
++#if defined(HAVE_X86_SHA1_HW_SUPPORT)
++/* HW specific version of sha1_process_bytes.  */
++
++static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *);
++
++static void
++sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
++{
++  /* When we already have some bits in our internal buffer concatenate
++     both inputs first.  */
++  if (ctx->buflen != 0)
++    {
++      size_t left_over = ctx->buflen;
++      size_t add = 128 - left_over > len ? len : 128 - left_over;
++
++      memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
++      ctx->buflen += add;
++
++      if (ctx->buflen > 64)
++        {
++          sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
++
++          ctx->buflen &= 63;
++          /* The regions in the following copy operation cannot overlap.  */
++          memcpy (ctx->buffer,
++                  &((char *) ctx->buffer)[(left_over + add) & ~63],
++                  ctx->buflen);
++        }
++
++      buffer = (const char *) buffer + add;
++      len -= add;
++    }
++
++  /* Process available complete blocks.  */
++  if (len >= 64)
++    {
++#if !_STRING_ARCH_unaligned
++# define alignof(type) offsetof (struct { char c; type x; }, x)
++# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
++      if (UNALIGNED_P (buffer))
++        while (len > 64)
++          {
++            sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
++            buffer = (const char *) buffer + 64;
++            len -= 64;
++          }
++      else
++#endif
++        {
++          sha1_hw_process_block (buffer, len & ~63, ctx);
++          buffer = (const char *) buffer + (len & ~63);
++          len &= 63;
++        }
++    }
++
++  /* Move remaining bytes in internal buffer.  */
++  if (len > 0)
++    {
++      size_t left_over = ctx->buflen;
++
++      memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
++      left_over += len;
++      if (left_over >= 64)
++        {
++          sha1_hw_process_block (ctx->buffer, 64, ctx);
++          left_over -= 64;
++          memmove (ctx->buffer, &ctx->buffer[16], left_over);
++        }
++      ctx->buflen = left_over;
++    }
++}
++
++/* Process LEN bytes of BUFFER, accumulating context into CTX.
++   Using CPU specific intrinsics.  */
++
++#ifdef HAVE_X86_SHA1_HW_SUPPORT
++__attribute__((__target__ ("sse4.1,sha")))
++#endif
++static void
++sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
++{
++#ifdef HAVE_X86_SHA1_HW_SUPPORT
++  /* Implemented from
++     https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html  */
++  const __m128i *words = (const __m128i *) buffer;
++  const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
++  __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
++  const __m128i shuf_mask
++    = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
++  char check[((offsetof (struct sha1_ctx, B)
++               == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
++              && (offsetof (struct sha1_ctx, C)
++                  == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
++              && (offsetof (struct sha1_ctx, D)
++                  == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
++             ? 1 : -1];
++
++  /* First increment the byte count.  RFC 1321 specifies the possible
++     length of the file up to 2^64 bits.  Here we only compute the
++     number of bytes.  Do a double word increment.  */
++  ctx->total[0] += len;
++  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);
++
++  (void) &check[0];
++  abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
++  e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
++  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
++
++  while (words < endp)
++    {
++      abcd_save = abcd;
++      e0_save = e0;
++
++      /* 0..3 */
++      msg0 = _mm_loadu_si128 (words);
++      msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
++      e0 = _mm_add_epi32 (e0, msg0);
++      e1 = abcd;
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
++
++      /* 4..7 */
++      msg1 = _mm_loadu_si128 (words + 1);
++      msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
++      e1 = _mm_sha1nexte_epu32 (e1, msg1);
++      e0 = abcd;
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
++      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
++
++      /* 8..11 */
++      msg2 = _mm_loadu_si128 (words + 2);
++      msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
++      e0 = _mm_sha1nexte_epu32 (e0, msg2);
++      e1 = abcd;
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
++      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
++      msg0 = _mm_xor_si128 (msg0, msg2);
++
++      /* 12..15 */
++      msg3 = _mm_loadu_si128 (words + 3);
++      msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
++      e1 = _mm_sha1nexte_epu32 (e1, msg3);
++      e0 = abcd;
++      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
++      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
++      msg1 = _mm_xor_si128 (msg1, msg3);
++
++      /* 16..19 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg0);
++      e1 = abcd;
++      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
++      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
++      msg2 = _mm_xor_si128 (msg2, msg0);
++
++      /* 20..23 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg1);
++      e0 = abcd;
++      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
++      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
++      msg3 = _mm_xor_si128 (msg3, msg1);
++
++      /* 24..27 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg2);
++      e1 = abcd;
++      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
++      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
++      msg0 = _mm_xor_si128 (msg0, msg2);
++
++      /* 28..31 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg3);
++      e0 = abcd;
++      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
++      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
++      msg1 = _mm_xor_si128 (msg1, msg3);
++
++      /* 32..35 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg0);
++      e1 = abcd;
++      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
++      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
++      msg2 = _mm_xor_si128 (msg2, msg0);
++
++      /* 36..39 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg1);
++      e0 = abcd;
++      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
++      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
++      msg3 = _mm_xor_si128 (msg3, msg1);
++
++      /* 40..43 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg2);
++      e1 = abcd;
++      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
++      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
++      msg0 = _mm_xor_si128 (msg0, msg2);
++
++      /* 44..47 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg3);
++      e0 = abcd;
++      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
++      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
++      msg1 = _mm_xor_si128 (msg1, msg3);
++
++      /* 48..51 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg0);
++      e1 = abcd;
++      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
++      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
++      msg2 = _mm_xor_si128 (msg2, msg0);
++
++      /* 52..55 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg1);
++      e0 = abcd;
++      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
++      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
++      msg3 = _mm_xor_si128 (msg3, msg1);
++
++      /* 56..59 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg2);
++      e1 = abcd;
++      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
++      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
++      msg0 = _mm_xor_si128 (msg0, msg2);
++
++      /* 60..63 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg3);
++      e0 = abcd;
++      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
++      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
++      msg1 = _mm_xor_si128 (msg1, msg3);
++
++      /* 64..67 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg0);
++      e1 = abcd;
++      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
++      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
++      msg2 = _mm_xor_si128 (msg2, msg0);
++
++      /* 68..71 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg1);
++      e0 = abcd;
++      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
++      msg3 = _mm_xor_si128 (msg3, msg1);
++
++      /* 72..75 */
++      e0 = _mm_sha1nexte_epu32 (e0, msg2);
++      e1 = abcd;
++      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
++      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
++
++      /* 76..79 */
++      e1 = _mm_sha1nexte_epu32 (e1, msg3);
++      e0 = abcd;
++      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
++
++      /* Finalize. */
++      e0 = _mm_sha1nexte_epu32 (e0, e0_save);
++      abcd = _mm_add_epi32 (abcd, abcd_save);
++
++      words = words + 4;
++    }
++
++  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
++  _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
++  ctx->E = _mm_extract_epi32 (e0, 3);
++#endif
++}
++#endif
++
++/* Return sha1_process_bytes or some hardware optimized version thereof
++   depending on current CPU.  */
++
++sha1_process_bytes_fn
++sha1_choose_process_bytes (void)
++{
++#ifdef HAVE_X86_SHA1_HW_SUPPORT
++  unsigned int eax, ebx, ecx, edx;
++  if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
++      && (ebx & bit_SHA) != 0
++      && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
++      && (ecx & bit_SSE4_1) != 0)
++    return sha1_hw_process_bytes;
++#endif
++  return sha1_process_bytes;
++}
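
Note (not part of the patch): a minimal caller sketch of the interface the patch adds. It assumes libiberty's existing include/sha1.h API (sha1_init_ctx, sha1_finish_ctx) alongside the new sha1_choose_process_bytes hook; the function pointer is resolved once and then used wherever sha1_process_bytes would have been called, so the SHA-extension code path is selected automatically on CPUs that advertise SHA and SSE4.1.

    /* Hypothetical usage example, not shipped with the patch.  */
    #include <stdio.h>
    #include "sha1.h"   /* libiberty's include/sha1.h */

    int
    main (void)
    {
      static const char data[] = "hello world";
      unsigned char digest[20];
      struct sha1_ctx ctx;

      /* Resolve the best implementation once (plain C or x86 SHA HW).  */
      sha1_process_bytes_fn process = sha1_choose_process_bytes ();

      sha1_init_ctx (&ctx);
      process (data, sizeof (data) - 1, &ctx);  /* drop-in for sha1_process_bytes */
      sha1_finish_ctx (&ctx, digest);

      for (int i = 0; i < 20; i++)
        printf ("%02x", digest[i]);
      putchar ('\n');
      return 0;
    }

Because both implementations share struct sha1_ctx and the same buffering behavior, a caller can mix the returned pointer with the generic sha1_process_bytes freely; only the per-block routine differs.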