summaryrefslogtreecommitdiffstats
path: root/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx.c')
-rw-r--r--src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx.c443
1 files changed, 443 insertions, 0 deletions
diff --git a/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx.c b/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx.c
new file mode 100644
index 000000000..ff007fb3c
--- /dev/null
+++ b/src/pmdk/src/libpmem2/x86_64/memcpy/memcpy_nt_avx.c
@@ -0,0 +1,443 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2017-2020, Intel Corporation */
+
+#include <immintrin.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pmem2_arch.h"
+#include "avx.h"
+#include "flush.h"
+#include "memcpy_memset.h"
+#include "memcpy_avx.h"
+#include "valgrind_internal.h"
+
+static force_inline __m256i
+mm256_loadu_si256(const char *src, unsigned idx)
+{
+ return _mm256_loadu_si256((const __m256i *)src + idx);
+}
+
+static force_inline void
+mm256_stream_si256(char *dest, unsigned idx, __m256i src)
+{
+ _mm256_stream_si256((__m256i *)dest + idx, src);
+ barrier();
+}
+
+static force_inline void
+memmove_movnt8x64b(char *dest, const char *src)
+{
+ __m256i ymm0 = mm256_loadu_si256(src, 0);
+ __m256i ymm1 = mm256_loadu_si256(src, 1);
+ __m256i ymm2 = mm256_loadu_si256(src, 2);
+ __m256i ymm3 = mm256_loadu_si256(src, 3);
+ __m256i ymm4 = mm256_loadu_si256(src, 4);
+ __m256i ymm5 = mm256_loadu_si256(src, 5);
+ __m256i ymm6 = mm256_loadu_si256(src, 6);
+ __m256i ymm7 = mm256_loadu_si256(src, 7);
+ __m256i ymm8 = mm256_loadu_si256(src, 8);
+ __m256i ymm9 = mm256_loadu_si256(src, 9);
+ __m256i ymm10 = mm256_loadu_si256(src, 10);
+ __m256i ymm11 = mm256_loadu_si256(src, 11);
+ __m256i ymm12 = mm256_loadu_si256(src, 12);
+ __m256i ymm13 = mm256_loadu_si256(src, 13);
+ __m256i ymm14 = mm256_loadu_si256(src, 14);
+ __m256i ymm15 = mm256_loadu_si256(src, 15);
+
+ mm256_stream_si256(dest, 0, ymm0);
+ mm256_stream_si256(dest, 1, ymm1);
+ mm256_stream_si256(dest, 2, ymm2);
+ mm256_stream_si256(dest, 3, ymm3);
+ mm256_stream_si256(dest, 4, ymm4);
+ mm256_stream_si256(dest, 5, ymm5);
+ mm256_stream_si256(dest, 6, ymm6);
+ mm256_stream_si256(dest, 7, ymm7);
+ mm256_stream_si256(dest, 8, ymm8);
+ mm256_stream_si256(dest, 9, ymm9);
+ mm256_stream_si256(dest, 10, ymm10);
+ mm256_stream_si256(dest, 11, ymm11);
+ mm256_stream_si256(dest, 12, ymm12);
+ mm256_stream_si256(dest, 13, ymm13);
+ mm256_stream_si256(dest, 14, ymm14);
+ mm256_stream_si256(dest, 15, ymm15);
+}
+
+static force_inline void
+memmove_movnt4x64b(char *dest, const char *src)
+{
+ __m256i ymm0 = mm256_loadu_si256(src, 0);
+ __m256i ymm1 = mm256_loadu_si256(src, 1);
+ __m256i ymm2 = mm256_loadu_si256(src, 2);
+ __m256i ymm3 = mm256_loadu_si256(src, 3);
+ __m256i ymm4 = mm256_loadu_si256(src, 4);
+ __m256i ymm5 = mm256_loadu_si256(src, 5);
+ __m256i ymm6 = mm256_loadu_si256(src, 6);
+ __m256i ymm7 = mm256_loadu_si256(src, 7);
+
+ mm256_stream_si256(dest, 0, ymm0);
+ mm256_stream_si256(dest, 1, ymm1);
+ mm256_stream_si256(dest, 2, ymm2);
+ mm256_stream_si256(dest, 3, ymm3);
+ mm256_stream_si256(dest, 4, ymm4);
+ mm256_stream_si256(dest, 5, ymm5);
+ mm256_stream_si256(dest, 6, ymm6);
+ mm256_stream_si256(dest, 7, ymm7);
+}
+
+static force_inline void
+memmove_movnt2x64b(char *dest, const char *src)
+{
+ __m256i ymm0 = mm256_loadu_si256(src, 0);
+ __m256i ymm1 = mm256_loadu_si256(src, 1);
+ __m256i ymm2 = mm256_loadu_si256(src, 2);
+ __m256i ymm3 = mm256_loadu_si256(src, 3);
+
+ mm256_stream_si256(dest, 0, ymm0);
+ mm256_stream_si256(dest, 1, ymm1);
+ mm256_stream_si256(dest, 2, ymm2);
+ mm256_stream_si256(dest, 3, ymm3);
+}
+
+static force_inline void
+memmove_movnt1x64b(char *dest, const char *src)
+{
+ __m256i ymm0 = mm256_loadu_si256(src, 0);
+ __m256i ymm1 = mm256_loadu_si256(src, 1);
+
+ mm256_stream_si256(dest, 0, ymm0);
+ mm256_stream_si256(dest, 1, ymm1);
+}
+
+static force_inline void
+memmove_movnt1x32b(char *dest, const char *src)
+{
+ __m256i ymm0 = _mm256_loadu_si256((__m256i *)src);
+
+ mm256_stream_si256(dest, 0, ymm0);
+}
+
+static force_inline void
+memmove_movnt1x16b(char *dest, const char *src)
+{
+ __m128i xmm0 = _mm_loadu_si128((__m128i *)src);
+
+ _mm_stream_si128((__m128i *)dest, xmm0);
+}
+
+static force_inline void
+memmove_movnt1x8b(char *dest, const char *src)
+{
+ _mm_stream_si64((long long *)dest, *(long long *)src);
+}
+
+static force_inline void
+memmove_movnt1x4b(char *dest, const char *src)
+{
+ _mm_stream_si32((int *)dest, *(int *)src);
+}
+
+static force_inline void
+memmove_movnt_avx_fw(char *dest, const char *src, size_t len, flush_fn flush,
+ perf_barrier_fn perf_barrier)
+{
+ size_t cnt = (uint64_t)dest & 63;
+ if (cnt > 0) {
+ cnt = 64 - cnt;
+
+ if (cnt > len)
+ cnt = len;
+
+ memmove_small_avx(dest, src, cnt, flush);
+
+ dest += cnt;
+ src += cnt;
+ len -= cnt;
+ }
+
+ const char *srcend = src + len;
+ prefetch_ini_fw(src, len);
+
+ while (len >= PERF_BARRIER_SIZE) {
+ prefetch_next_fw(src, srcend);
+
+ memmove_movnt8x64b(dest, src);
+ dest += 8 * 64;
+ src += 8 * 64;
+ len -= 8 * 64;
+
+ memmove_movnt4x64b(dest, src);
+ dest += 4 * 64;
+ src += 4 * 64;
+ len -= 4 * 64;
+
+ COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (8 + 4) * 64);
+
+ if (len)
+ perf_barrier();
+ }
+
+ if (len >= 8 * 64) {
+ memmove_movnt8x64b(dest, src);
+ dest += 8 * 64;
+ src += 8 * 64;
+ len -= 8 * 64;
+ }
+
+ if (len >= 4 * 64) {
+ memmove_movnt4x64b(dest, src);
+ dest += 4 * 64;
+ src += 4 * 64;
+ len -= 4 * 64;
+ }
+
+ if (len >= 2 * 64) {
+ memmove_movnt2x64b(dest, src);
+ dest += 2 * 64;
+ src += 2 * 64;
+ len -= 2 * 64;
+ }
+
+ if (len >= 1 * 64) {
+ memmove_movnt1x64b(dest, src);
+
+ dest += 1 * 64;
+ src += 1 * 64;
+ len -= 1 * 64;
+ }
+
+ if (len == 0)
+ goto end;
+
+ /* There's no point in using more than 1 nt store for 1 cache line. */
+ if (util_is_pow2(len)) {
+ if (len == 32)
+ memmove_movnt1x32b(dest, src);
+ else if (len == 16)
+ memmove_movnt1x16b(dest, src);
+ else if (len == 8)
+ memmove_movnt1x8b(dest, src);
+ else if (len == 4)
+ memmove_movnt1x4b(dest, src);
+ else
+ goto nonnt;
+
+ goto end;
+ }
+
+nonnt:
+ memmove_small_avx(dest, src, len, flush);
+end:
+ avx_zeroupper();
+}
+
+static force_inline void
+memmove_movnt_avx_bw(char *dest, const char *src, size_t len, flush_fn flush,
+ perf_barrier_fn perf_barrier)
+{
+ dest += len;
+ src += len;
+
+ size_t cnt = (uint64_t)dest & 63;
+ if (cnt > 0) {
+ if (cnt > len)
+ cnt = len;
+
+ dest -= cnt;
+ src -= cnt;
+ len -= cnt;
+
+ memmove_small_avx(dest, src, cnt, flush);
+ }
+
+ const char *srcbegin = src - len;
+ prefetch_ini_bw(src, len);
+
+ while (len >= PERF_BARRIER_SIZE) {
+ prefetch_next_bw(src, srcbegin);
+
+ dest -= 8 * 64;
+ src -= 8 * 64;
+ len -= 8 * 64;
+ memmove_movnt8x64b(dest, src);
+
+ dest -= 4 * 64;
+ src -= 4 * 64;
+ len -= 4 * 64;
+ memmove_movnt4x64b(dest, src);
+
+ COMPILE_ERROR_ON(PERF_BARRIER_SIZE != (8 + 4) * 64);
+
+ if (len)
+ perf_barrier();
+ }
+
+ if (len >= 8 * 64) {
+ dest -= 8 * 64;
+ src -= 8 * 64;
+ len -= 8 * 64;
+ memmove_movnt8x64b(dest, src);
+ }
+
+ if (len >= 4 * 64) {
+ dest -= 4 * 64;
+ src -= 4 * 64;
+ len -= 4 * 64;
+ memmove_movnt4x64b(dest, src);
+ }
+
+ if (len >= 2 * 64) {
+ dest -= 2 * 64;
+ src -= 2 * 64;
+ len -= 2 * 64;
+ memmove_movnt2x64b(dest, src);
+ }
+
+ if (len >= 1 * 64) {
+ dest -= 1 * 64;
+ src -= 1 * 64;
+ len -= 1 * 64;
+ memmove_movnt1x64b(dest, src);
+ }
+
+ if (len == 0)
+ goto end;
+
+ /* There's no point in using more than 1 nt store for 1 cache line. */
+ if (util_is_pow2(len)) {
+ if (len == 32) {
+ dest -= 32;
+ src -= 32;
+ memmove_movnt1x32b(dest, src);
+ } else if (len == 16) {
+ dest -= 16;
+ src -= 16;
+ memmove_movnt1x16b(dest, src);
+ } else if (len == 8) {
+ dest -= 8;
+ src -= 8;
+ memmove_movnt1x8b(dest, src);
+ } else if (len == 4) {
+ dest -= 4;
+ src -= 4;
+ memmove_movnt1x4b(dest, src);
+ } else {
+ goto nonnt;
+ }
+
+ goto end;
+ }
+
+nonnt:
+ dest -= len;
+ src -= len;
+ memmove_small_avx(dest, src, len, flush);
+end:
+ avx_zeroupper();
+}
+
+static force_inline void
+memmove_movnt_avx(char *dest, const char *src, size_t len, flush_fn flush,
+ barrier_fn barrier, perf_barrier_fn perf_barrier)
+{
+ if ((uintptr_t)dest - (uintptr_t)src >= len)
+ memmove_movnt_avx_fw(dest, src, len, flush, perf_barrier);
+ else
+ memmove_movnt_avx_bw(dest, src, len, flush, perf_barrier);
+
+ barrier();
+
+ VALGRIND_DO_FLUSH(dest, len);
+}
+
+/* variants without perf_barrier */
+
+void
+memmove_movnt_avx_noflush_nobarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, noflush, barrier_after_ntstores,
+ no_barrier);
+}
+
+void
+memmove_movnt_avx_empty_nobarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_empty_nolog,
+ barrier_after_ntstores, no_barrier);
+}
+void
+memmove_movnt_avx_clflush_nobarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_clflush_nolog,
+ barrier_after_ntstores, no_barrier);
+}
+
+void
+memmove_movnt_avx_clflushopt_nobarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_clflushopt_nolog,
+ no_barrier_after_ntstores, no_barrier);
+}
+
+void
+memmove_movnt_avx_clwb_nobarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_clwb_nolog,
+ no_barrier_after_ntstores, no_barrier);
+}
+
+/* variants with perf_barrier */
+
+void
+memmove_movnt_avx_noflush_wcbarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, noflush, barrier_after_ntstores,
+ wc_barrier);
+}
+
+void
+memmove_movnt_avx_empty_wcbarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_empty_nolog,
+ barrier_after_ntstores, wc_barrier);
+}
+void
+memmove_movnt_avx_clflush_wcbarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_clflush_nolog,
+ barrier_after_ntstores, wc_barrier);
+}
+
+void
+memmove_movnt_avx_clflushopt_wcbarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_clflushopt_nolog,
+ no_barrier_after_ntstores, wc_barrier);
+}
+
+void
+memmove_movnt_avx_clwb_wcbarrier(char *dest, const char *src, size_t len)
+{
+ LOG(15, "dest %p src %p len %zu", dest, src, len);
+
+ memmove_movnt_avx(dest, src, len, flush_clwb_nolog,
+ no_barrier_after_ntstores, wc_barrier);
+}