diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/intel-ipsec-mb/include/constant_lookup.asm | |
parent | Initial commit. (diff) | |
download | ceph-upstream.tar.xz ceph-upstream.zip |
Adding upstream version 16.2.11+ds. (refs: upstream/16.2.11+ds, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/spdk/intel-ipsec-mb/include/constant_lookup.asm | 561 |
1 files changed, 561 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/constant_lookup.asm b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm new file mode 100644 index 000000000..a3c81dc75 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm @@ -0,0 +1,561 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;

;; Syntax: NASM (Intel operand order), x86-64.
;; ABI:    System V AMD64 when LINUX is defined, Microsoft x64 otherwise
;;         (see the arg1..arg3 definitions below).
;;
;; Table look-up helpers that read every 16-byte chunk of the table and
;; select the wanted element with a compare mask instead of using idx to
;; form the load address.
;; NOTE(review): presumably this is done so that the memory access pattern
;; does not depend on idx - confirm against the callers.

%include "include/os.asm"
%include "include/reg_sizes.asm"

section .data
default rel

;; idx_tabN holds the lane numbers of the first vector of N-bit elements,
;; add_M is the per-lane increment (M = number of N-bit lanes in an XMM).

align 16
idx_tab8:
        db      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        db      0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF

align 16
add_16:
        db      0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
        db      0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10

align 16
idx_tab16:
        dw      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7

align 16
add_8:
        dw      0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8

align 16
idx_tab32:
        dd      0x0, 0x1, 0x2, 0x3

align 16
add_4:
        dd      0x4, 0x4, 0x4, 0x4

align 16
idx_tab64:
        dq      0x0, 0x1

;; Explicit alignment: add_2 is loaded with (v)movdqa and must not rely on
;; the size of the table that happens to precede it.
align 16
add_2:
        dq      0x2, 0x2

;; pshufb control that replicates bytes 0 and 1 (the low word) into all
;; eight word lanes.
align 16
bcast_mask:
        db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
        db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01

section .text

%ifdef LINUX
        %define arg1    rdi
        %define arg2    rsi
        %define arg3    rdx
%else
        %define arg1    rcx
        %define arg2    rdx
        %define arg3    r8
%endif

;; Register roles shared by every function in this file
%define bcast_idx       xmm0    ; idx replicated into every lane
%define xadd            xmm1    ; per-lane index increment
%define accum_val       xmm2    ; OR of all masked table chunks
%define xindices        xmm3    ; element indices of the current chunk
%define xtmp            xmm4    ; compare mask / reduction scratch
%define xtmp2           xmm5    ; current 16-byte table chunk
%define tmp             r9      ; not referenced by the code in this file
%define offset          r10     ; byte offset of the current chunk

%define table           arg1
%define idx             arg2
%define size            arg3

;; Notes common to all functions below:
;;  - idx and size are 32-bit arguments, so only the low 32 bits of their
;;    registers are meaningful; the code zero-extends them before any
;;    64-bit use.
;;  - The table is read with (v)movdqa, so it must be 16-byte aligned.
;;  - size is a number of elements; elements past the last complete
;;    16-byte chunk are never read.
;;  - The result is 0 when there is no complete chunk or when no element
;;    index matches idx.
;;  - Clobbers: rax, r10, xmm0-xmm5, flags, and the register holding size.

;-----------------------------------------------------------------------
; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 8-bit entries in the table (multiple of 16)
; Out   : al = table[idx]
; Only the low byte of idx is compared and the running index wraps at 256,
; so entries whose indices are equal modulo 256 are OR-ed together.
;-----------------------------------------------------------------------
MKGLOBAL(lookup_8bit_sse,function,internal)
lookup_8bit_sse:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 16 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 4
        je      exit8_sse

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        pxor    xtmp, xtmp
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp         ; all-zero control = replicate byte 0

        movdqa  xadd, [rel add_16]
        movdqa  xindices, [rel idx_tab8]

loop8_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqb xtmp, bcast_idx

        ;; Load next 16 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 16 indices
        paddb   xindices, xadd

        add     offset, 16
        dec     size

        jne     loop8_sse

        ;; Extract value from XMM register:
        ;; fold every byte lane into byte 15 by OR-ing shifted copies
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8                 ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4                 ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2                 ; shift left by 16 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 1                 ; shift left by 8 bits
        por     accum_val, xtmp

        pextrb  rax, accum_val, 15

exit8_sse:
        ret

;-----------------------------------------------------------------------
; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 8-bit entries in the table (multiple of 16)
; Out   : al = table[idx]
; Only the low byte of idx is compared and the running index wraps at 256,
; so entries whose indices are equal modulo 256 are OR-ed together.
;-----------------------------------------------------------------------
MKGLOBAL(lookup_8bit_avx,function,internal)
lookup_8bit_avx:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 16 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 4
        je      exit8_avx

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vpxor   xtmp, xtmp
        vpxor   accum_val, accum_val
        vpshufb bcast_idx, xtmp         ; all-zero control = replicate byte 0

        vmovdqa xadd, [rel add_16]
        vmovdqa xindices, [rel idx_tab8]

loop8_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqb xtmp, xindices, bcast_idx

        ;; Load next 16 values
        vmovdqa xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 16 indices
        vpaddb  xindices, xadd

        add     offset, 16
        dec     size

        jne     loop8_avx

        ;; Extract value from XMM register:
        ;; fold every byte lane into byte 15 by OR-ing shifted copies
        vpslldq xtmp, accum_val, 8      ; shift left by 64 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 4      ; shift left by 32 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 2      ; shift left by 16 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 1      ; shift left by 8 bits
        vpor    accum_val, xtmp

        vpextrb rax, accum_val, 15

exit8_avx:
        ret

;-----------------------------------------------------------------------
; uint16_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 16-bit entries in the table (multiple of 8)
; Out   : ax = table[idx]
; Only the low word of idx is compared and the running index wraps at
; 65536.
;-----------------------------------------------------------------------
MKGLOBAL(lookup_16bit_sse,function,internal)
lookup_16bit_sse:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 8 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 3
        je      exit16_sse

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        movdqa  xtmp, [rel bcast_mask]
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp         ; replicate the low word of idx

        movdqa  xadd, [rel add_8]
        movdqa  xindices, [rel idx_tab16]

loop16_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqw xtmp, bcast_idx

        ;; Load next 8 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 8 indices
        paddw   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop16_sse

        ;; Extract value from XMM register:
        ;; fold every word lane into word 7 by OR-ing shifted copies
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8                 ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4                 ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2                 ; shift left by 16 bits
        por     accum_val, xtmp

        pextrw  rax, accum_val, 7

exit16_sse:
        ret

;-----------------------------------------------------------------------
; uint16_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 16-bit entries in the table (multiple of 8)
; Out   : ax = table[idx]
; Only the low word of idx is compared and the running index wraps at
; 65536.
;-----------------------------------------------------------------------
MKGLOBAL(lookup_16bit_avx,function,internal)
lookup_16bit_avx:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 8 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 3
        je      exit16_avx

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vmovdqa xtmp, [rel bcast_mask]
        vpxor   accum_val, accum_val
        vpshufb bcast_idx, xtmp         ; replicate the low word of idx

        vmovdqa xadd, [rel add_8]
        vmovdqa xindices, [rel idx_tab16]

loop16_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqw xtmp, xindices, bcast_idx

        ;; Load next 8 values
        vmovdqa xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 8 indices
        vpaddw  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop16_avx

        ;; Extract value from XMM register:
        ;; fold every word lane into word 7 by OR-ing shifted copies
        vpslldq xtmp, accum_val, 8      ; shift left by 64 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 4      ; shift left by 32 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 2      ; shift left by 16 bits
        vpor    accum_val, xtmp

        vpextrw rax, accum_val, 7

exit16_avx:
        ret

;-----------------------------------------------------------------------
; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 32-bit entries in the table (multiple of 4)
; Out   : eax = table[idx]
;-----------------------------------------------------------------------
MKGLOBAL(lookup_32bit_sse,function,internal)
lookup_32bit_sse:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 4 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 2
        je      exit32_sse

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        pxor    accum_val, accum_val
        pshufd  bcast_idx, bcast_idx, 0

        movdqa  xadd, [rel add_4]
        movdqa  xindices, [rel idx_tab32]

loop32_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqd xtmp, bcast_idx

        ;; Load next 4 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 4 indices
        paddd   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop32_sse

        ;; Extract value from XMM register:
        ;; fold every dword lane into dword 0 by OR-ing shifted copies
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8                 ; shift right by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        psrldq  xtmp, 4                 ; shift right by 32 bits
        por     accum_val, xtmp

        movd    eax, accum_val

exit32_sse:
        ret

;-----------------------------------------------------------------------
; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 32-bit entries in the table (multiple of 4)
; Out   : eax = table[idx]
;-----------------------------------------------------------------------
MKGLOBAL(lookup_32bit_avx,function,internal)
lookup_32bit_avx:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 4 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 2
        je      exit32_avx

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vpxor   accum_val, accum_val
        vpshufd bcast_idx, bcast_idx, 0

        vmovdqa xadd, [rel add_4]
        vmovdqa xindices, [rel idx_tab32]

loop32_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqd xtmp, xindices, bcast_idx

        ;; Load next 4 values
        vmovdqa xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 4 indices
        vpaddd  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop32_avx

        ;; Extract value from XMM register:
        ;; fold every dword lane into dword 0 by OR-ing shifted copies
        vpsrldq xtmp, accum_val, 8      ; shift right by 64 bits
        vpor    accum_val, xtmp

        vpsrldq xtmp, accum_val, 4      ; shift right by 32 bits
        vpor    accum_val, xtmp

        vmovd   eax, accum_val

exit32_avx:
        ret

;-----------------------------------------------------------------------
; uint64_t lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 64-bit entries in the table (multiple of 2)
; Out   : rax = table[idx]
;-----------------------------------------------------------------------
MKGLOBAL(lookup_64bit_sse,function,internal)
lookup_64bit_sse:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 2 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 1
        je      exit64_sse

        xor     offset, offset

        ;; idx is compared as a 64-bit lane below, so clear bits 63:32 of
        ;; its register first (they are undefined for a uint32_t argument)
        mov     DWORD(idx), DWORD(idx)

        ;; Broadcast idx to look up
        movq    bcast_idx, idx
        pxor    accum_val, accum_val
        pinsrq  bcast_idx, idx, 1

        movdqa  xadd, [rel add_2]
        movdqa  xindices, [rel idx_tab64]

loop64_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqq xtmp, bcast_idx

        ;; Load next 2 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 2 indices
        paddq   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop64_sse

        ;; Extract value from XMM register:
        ;; fold the upper qword lane into qword 0
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8                 ; shift right by 64 bits
        por     accum_val, xtmp

        movq    rax, accum_val

exit64_sse:
        ret

;-----------------------------------------------------------------------
; uint64_t lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned)
; arg 2 : index to look up
; arg 3 : number of 64-bit entries in the table (multiple of 2)
; Out   : rax = table[idx]
;-----------------------------------------------------------------------
MKGLOBAL(lookup_64bit_avx,function,internal)
lookup_64bit_avx:
        ;; Defined result (0) for the early exit below
        xor     eax, eax

        ;; Number of loop iters = table size / 2 (number of values in XMM)
        ;; 32-bit shift: ignores/clears the undefined upper half of the register
        shr     DWORD(size), 1
        je      exit64_avx

        xor     offset, offset

        ;; idx is compared as a 64-bit lane below, so clear bits 63:32 of
        ;; its register first (they are undefined for a uint32_t argument)
        mov     DWORD(idx), DWORD(idx)

        ;; Broadcast idx to look up
        vmovq   bcast_idx, idx
        vpxor   accum_val, accum_val
        vpinsrq bcast_idx, idx, 1

        vmovdqa xadd, [rel add_2]
        vmovdqa xindices, [rel idx_tab64]

loop64_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqq xtmp, xindices, bcast_idx

        ;; Load next 2 values
        vmovdqa xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 2 indices
        vpaddq  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop64_avx

        ;; Extract value from XMM register:
        ;; fold the upper qword lane into qword 0
        vpsrldq xtmp, accum_val, 8      ; shift right by 64 bits
        vpor    accum_val, xtmp

        vmovq   rax, accum_val

exit64_avx:
        ret


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif