1 files changed, 340 insertions, 0 deletions
diff --git a/mysys/crc32/crc32_arm64.c b/mysys/crc32/crc32_arm64.c
new file mode 100644
index 00000000..0e70c218
--- /dev/null
+++ b/mysys/crc32/crc32_arm64.c
@@ -0,0 +1,340 @@
+#include <my_global.h>
+#include <string.h>
+#include <stdint.h>
+
+static int pmull_supported;
+
+#if defined(HAVE_ARMV8_CRC)
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+
+int crc32_aarch64_available(void)
+{
+  int ret;
+  size_t len = sizeof(ret);
+  if (sysctlbyname("hw.optional.armv8_crc32", &ret, &len, NULL, 0) == -1)
+    return 0;
+  return ret;
+}
+
+const char *crc32c_aarch64_available(void)
+{
+  if (crc32_aarch64_available() == 0)
+    return NULL;
+  pmull_supported = 1;
+  return "Using ARMv8 crc32 + pmull instructions";
+}
+
+#else
+#include <sys/auxv.h>
+#if defined(__FreeBSD__)
+static unsigned long getauxval(unsigned int key)
+{
+  unsigned long val;
+  if (elf_aux_info(key, (void *)&val, (int)sizeof(val) != 0)
+    return 0ul;
+  return val;
+}
+#else
+# include <asm/hwcap.h>
+#endif
+
+#ifndef HWCAP_CRC32
+# define HWCAP_CRC32 (1 << 7)
+#endif
+
+#ifndef HWCAP_PMULL
+# define HWCAP_PMULL (1 << 4)
+#endif
+
+/* ARM made crc32 default from ARMv8.1 but optional in ARMv8A
+ * Runtime check API.
+ */
+int crc32_aarch64_available(void)
+{
+  unsigned long auxv= getauxval(AT_HWCAP);
+  return (auxv & HWCAP_CRC32) != 0;
+}
+
+const char *crc32c_aarch64_available(void)
+{
+  unsigned long auxv= getauxval(AT_HWCAP);
+
+  if (!(auxv & HWCAP_CRC32))
+    return NULL;
+
+  pmull_supported= (auxv & HWCAP_PMULL) != 0;
+  if (pmull_supported)
+    return "Using ARMv8 crc32 + pmull instructions";
+  else
+    return "Using ARMv8 crc32 instructions";
+}
+
+#endif /* __APPLE__ */
+#endif /* HAVE_ARMV8_CRC */
+
+#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
+/* Request crc extension capabilities from the assembler */
+asm(".arch_extension crc");
+
+# ifdef HAVE_ARMV8_CRYPTO
+/* crypto extension  */
+asm(".arch_extension crypto");
+# endif
+
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+
+#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+
+
+#define CRC32C3X8(buffer, ITR) \
+  __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
+  __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
+  __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
+
+#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS  */
+
+/* Intrinsics header*/
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
+#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
+#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
+#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
+
+#define CRC32X(crc, value) (crc) = __crc32d((crc), (value))
+#define CRC32W(crc, value) (crc) = __crc32w((crc), (value))
+#define CRC32H(crc, value) (crc) = __crc32h((crc), (value))
+#define CRC32B(crc, value) (crc) = __crc32b((crc), (value))
+
+#define CRC32C3X8(buffer, ITR) \
+  crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+  crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+  crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+#define CRC32C7X3X8(buffer, ITR) do {\
+  CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
+  CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
+  CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
+  CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
+  CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
+  CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
+  CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
+} while(0)
+
+#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL1(buffer, PREF_OFFSET) \
+  PREF4X64L1(buffer,(PREF_OFFSET), 0) \
+  PREF4X64L1(buffer,(PREF_OFFSET), 4) \
+  PREF4X64L1(buffer,(PREF_OFFSET), 8) \
+  PREF4X64L1(buffer,(PREF_OFFSET), 12)
+
+#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
+  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL2(buffer, PREF_OFFSET) \
+  PREF4X64L2(buffer,(PREF_OFFSET), 0) \
+  PREF4X64L2(buffer,(PREF_OFFSET), 4) \
+  PREF4X64L2(buffer,(PREF_OFFSET), 8) \
+  PREF4X64L2(buffer,(PREF_OFFSET), 12)
+
+uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
+{
+  uint32_t crc0, crc1, crc2;
+  int64_t length= (int64_t)len;
+
+  crc^= 0xffffffff;
+
+  /* Pmull runtime check here.
+   * Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).
+   *
+   * Consider the condition that the target platform does support hardware crc32
+   * but not support PMULL. In this condition, it should leverage the aarch64
+   * crc32 instruction (__crc32c) and just only skip parallel computation (pmull/vmull)
+   * rather than skip all hardware crc32 instruction of computation.
+   */
+  if (pmull_supported)
+  {
+/* The following Macro (HAVE_ARMV8_CRYPTO) is used for compiling check */
+#ifdef HAVE_ARMV8_CRYPTO
+
+/* Crypto extension Support
+ * Parallel computation with 1024 Bytes (per block)
+ * Intrinsics Support
+ */
+# ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+    const poly64_t k1= 0xe417f38a, k2= 0x8f158014;
+    uint64_t t0, t1;
+
+    /* Process per block size of 1024 Bytes
+     * A block size = 8 + 42*3*sizeof(uint64_t) + 8
+     */
+    while ((length-= 1024) >= 0)
+    {
+      /* Prefetch 3*1024 data for avoiding L2 cache miss */
+      PREF1KL2(buffer, 1024*3);
+      /* Do first 8 bytes here for better pipelining */
+      crc0= __crc32cd(crc, *(const uint64_t *)buffer);
+      crc1= 0;
+      crc2= 0;
+      buffer+= sizeof(uint64_t);
+
+      /* Process block inline
+       * Process crc0 last to avoid dependency with above
+       */
+      CRC32C7X3X8(buffer, 0);
+      CRC32C7X3X8(buffer, 1);
+      CRC32C7X3X8(buffer, 2);
+      CRC32C7X3X8(buffer, 3);
+      CRC32C7X3X8(buffer, 4);
+      CRC32C7X3X8(buffer, 5);
+
+      buffer+= 42*3*sizeof(uint64_t);
+      /* Prefetch data for following block to avoid L1 cache miss */
+      PREF1KL1(buffer, 1024);
+
+      /* Last 8 bytes
+       * Merge crc0 and crc1 into crc2
+       * crc1 multiply by K2
+       * crc0 multiply by K1
+       */
+      t1= (uint64_t)vmull_p64(crc1, k2);
+      t0= (uint64_t)vmull_p64(crc0, k1);
+      crc= __crc32cd(crc2, *(const uint64_t *)buffer);
+      crc1= __crc32cd(0, t1);
+      crc^= crc1;
+      crc0= __crc32cd(0, t0);
+      crc^= crc0;
+
+      buffer+= sizeof(uint64_t);
+    }
+
+# else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+    /*No intrinsics*/
+    __asm__("mov    x16,            #0xf38a         \n\t"
+            "movk   x16,            #0xe417, lsl 16 \n\t"
+            "mov    v1.2d[0],       x16             \n\t"
+            "mov    x16,            #0x8014         \n\t"
+            "movk   x16,            #0x8f15, lsl 16 \n\t"
+            "mov    v0.2d[0],       x16             \n\t"
+            :::"x16");
+
+    while ((length-= 1024) >= 0)
+    {
+      PREF1KL2(buffer, 1024*3);
+      __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
+              :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
+      crc1= 0;
+      crc2= 0;
+      buffer+= sizeof(uint64_t);
+
+      CRC32C7X3X8(buffer, 0);
+      CRC32C7X3X8(buffer, 1);
+      CRC32C7X3X8(buffer, 2);
+      CRC32C7X3X8(buffer, 3);
+      CRC32C7X3X8(buffer, 4);
+      CRC32C7X3X8(buffer, 5);
+
+      buffer+= 42*3*sizeof(uint64_t);
+      PREF1KL1(buffer, 1024);
+      __asm__("mov            v2.2d[0],       %x[c1]          \n\t"
+              "pmull          v2.1q,          v2.1d,  v0.1d   \n\t"
+              "mov            v3.2d[0],       %x[c0]          \n\t"
+              "pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
+              "crc32cx        %w[c],          %w[c2], %x[v]   \n\t"
+              "mov            %x[c1],         v2.2d[0]        \n\t"
+              "crc32cx        %w[c1],         wzr,    %x[c1]  \n\t"
+              "eor            %w[c],          %w[c],  %w[c1]  \n\t"
+              "mov            %x[c0],         v3.2d[0]        \n\t"
+              "crc32cx        %w[c0],         wzr,    %x[c0]  \n\t"
+              "eor            %w[c],          %w[c],  %w[c0]  \n\t"
+              :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
+              :[v]"r"(*((const uint64_t *)buffer)));
+      buffer+= sizeof(uint64_t);
+    }
+# endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+    /* Done if Input data size is aligned with 1024  */
+    if (!(length+= 1024))
+      return ~crc;
+
+#endif /* HAVE_ARMV8_CRYPTO */
+
+  }  // end if pmull_supported
+
+  while ((length-= sizeof(uint64_t)) >= 0)
+  {
+    CRC32CX(crc, *(uint64_t *)buffer);
+    buffer+= sizeof(uint64_t);
+  }
+
+  /* The following is more efficient than the straight loop */
+  if (length & sizeof(uint32_t))
+  {
+    CRC32CW(crc, *(uint32_t *)buffer);
+    buffer+= sizeof(uint32_t);
+  }
+
+  if (length & sizeof(uint16_t))
+  {
+    CRC32CH(crc, *(uint16_t *)buffer);
+    buffer+= sizeof(uint16_t);
+  }
+
+  if (length & sizeof(uint8_t))
+    CRC32CB(crc, *buffer);
+
+  return ~crc;
+}
+
+/* There are multiple approaches to calculate crc.
+Approach-1: Process 8 bytes then 4 bytes then 2 bytes and then 1 bytes
+Approach-2: Process 8 bytes and remaining workload using 1 bytes
+Apporach-3: Process 64 bytes at once by issuing 8 crc call and remaining
+            using 8/1 combination.
+
+Based on micro-benchmark testing we found that Approach-2 works best especially
+given small chunk of variable data. */
+unsigned int crc32_aarch64(unsigned int crc, const void *buf, size_t len)
+{
+  const uint8_t *buf1= buf;
+  const uint64_t *buf8= (const uint64_t *) (((uintptr_t) buf + 7) & ~7);
+
+  crc= ~crc;
+
+  /* if start pointer is not 8 bytes aligned */
+  while ((buf1 != (const uint8_t *) buf8) && len)
+  {
+    CRC32B(crc, *buf1++);
+    len--;
+  }
+
+  for (; len >= 8; len-= 8)
+    CRC32X(crc, *buf8++);
+
+  buf1= (const uint8_t *) buf8;
+  while (len--)
+    CRC32B(crc, *buf1++);
+
+  return ~crc;
+}