summary refs log tree commit diff stats
path: root/src/isa-l/crc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/isa-l/crc/Makefile.am 64
-rw-r--r-- src/isa-l/crc/crc16_t10dif_01.asm 665
-rw-r--r-- src/isa-l/crc/crc16_t10dif_by4.asm 562
-rw-r--r-- src/isa-l/crc/crc16_t10dif_perf.c 87
-rw-r--r-- src/isa-l/crc/crc16_t10dif_test.c 167
-rw-r--r-- src/isa-l/crc/crc32_ieee_01.asm 655
-rw-r--r-- src/isa-l/crc/crc32_ieee_by4.asm 565
-rw-r--r-- src/isa-l/crc/crc32_ieee_perf.c 87
-rw-r--r-- src/isa-l/crc/crc32_ieee_test.c 174
-rw-r--r-- src/isa-l/crc/crc32_iscsi_00.asm 656
-rw-r--r-- src/isa-l/crc/crc32_iscsi_01.asm 572
-rw-r--r-- src/isa-l/crc/crc32_iscsi_perf.c 87
-rw-r--r-- src/isa-l/crc/crc32_iscsi_test.c 171
-rw-r--r-- src/isa-l/crc/crc64_base.c 159
-rw-r--r-- src/isa-l/crc/crc64_ecma_norm_by8.asm 583
-rw-r--r-- src/isa-l/crc/crc64_ecma_refl_by8.asm 548
-rw-r--r-- src/isa-l/crc/crc64_example.c 68
-rw-r--r-- src/isa-l/crc/crc64_funcs_perf.c 109
-rw-r--r-- src/isa-l/crc/crc64_funcs_test.c 290
-rw-r--r-- src/isa-l/crc/crc64_iso_norm_by8.asm 581
-rw-r--r-- src/isa-l/crc/crc64_iso_refl_by8.asm 544
-rw-r--r-- src/isa-l/crc/crc64_jones_norm_by8.asm 581
-rw-r--r-- src/isa-l/crc/crc64_jones_refl_by8.asm 544
-rw-r--r-- src/isa-l/crc/crc64_multibinary.asm 89
-rw-r--r-- src/isa-l/crc/crc_base.c 170
-rw-r--r-- src/isa-l/crc/crc_base_aliases.c 77
-rw-r--r-- src/isa-l/crc/crc_multibinary.asm 180
-rw-r--r-- src/isa-l/crc/crc_simple_test.c 63
28 files changed, 9098 insertions, 0 deletions
diff --git a/src/isa-l/crc/Makefile.am b/src/isa-l/crc/Makefile.am
new file mode 100644
index 00000000..74b96474
--- /dev/null
+++ b/src/isa-l/crc/Makefile.am
@@ -0,0 +1,64 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += \
+ crc/crc_base.c \
+ crc/crc64_base.c
+
+lsrc_base_aliases += crc/crc_base_aliases.c
+lsrc_x86_32 += crc/crc_base_aliases.c
+
+lsrc_x86_64 += \
+ crc/crc16_t10dif_01.asm \
+ crc/crc16_t10dif_by4.asm \
+ crc/crc32_ieee_01.asm \
+ crc/crc32_ieee_by4.asm \
+ crc/crc32_iscsi_01.asm \
+ crc/crc32_iscsi_00.asm \
+ crc/crc_multibinary.asm \
+ crc/crc64_multibinary.asm \
+ crc/crc64_ecma_refl_by8.asm \
+ crc/crc64_ecma_norm_by8.asm \
+ crc/crc64_iso_refl_by8.asm \
+ crc/crc64_iso_norm_by8.asm \
+ crc/crc64_jones_refl_by8.asm \
+ crc/crc64_jones_norm_by8.asm
+
+src_include += -I $(srcdir)/crc
+extern_hdrs += include/crc.h include/crc64.h
+
+other_src += include/reg_sizes.asm include/types.h include/test.h
+
+check_tests += crc/crc16_t10dif_test crc/crc32_ieee_test crc/crc32_iscsi_test \
+ crc/crc64_funcs_test
+
+perf_tests += crc/crc16_t10dif_perf crc/crc32_ieee_perf crc/crc32_iscsi_perf \
+ crc/crc64_funcs_perf
+
+examples += crc/crc_simple_test crc/crc64_example
diff --git a/src/isa-l/crc/crc16_t10dif_01.asm b/src/isa-l/crc/crc16_t10dif_01.asm
new file mode 100644
index 00000000..fb9c73c8
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_01.asm
@@ -0,0 +1,665 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; UINT16 crc16_t10dif_01(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+global crc16_t10dif_01:function
+crc16_t10dif_01:
+ ; in: arg1_low32 = init_crc, arg2 = buf, arg3 = len (see Function API above); the result is returned in eax
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+ ; After this point, code flow is exactly same as a 32-bit CRC.
+ ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
+
+ sub rsp, VARIABLE_OFFSET ; reserve scratch + save area; [rsp] is the 16B scratch buffer used by _less_than_16_left
+%ifidn __OUTPUT_FORMAT__, win64
+ ; win64 only: save xmm6-xmm13, which this routine overwrites (nonvolatile in the Microsoft x64 calling convention)
+ movdqa [rsp+16*2],xmm6
+ movdqa [rsp+16*3],xmm7
+ movdqa [rsp+16*4],xmm8
+ movdqa [rsp+16*5],xmm9
+ movdqa [rsp+16*6],xmm10
+ movdqa [rsp+16*7],xmm11
+ movdqa [rsp+16*8],xmm12
+ movdqa [rsp+16*9],xmm13
+%endif
+ ; NOTE(review): movdqa needs 16B-aligned addresses; the +8 in VARIABLE_OFFSET presumably compensates for the return address - confirm
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256 ; jl is a signed comparison of len against 256
+
+
+ ; load the initial crc value
+ movd xmm10, arg1_low32 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial crc at correct place.
+ pslldq xmm10, 12
+
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; arg3 is negative on entry (biased by -16); adding 16 back yields the number of tail bytes still unread.
+ ; If that is zero there is nothing more to fold: compute the CRC of the final 128 bits held in xmm7.
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can back the input pointer up so that the
+ ; load below reads exactly 16 bytes ending at the end of the buffer; the registers are then adjusted.
+_get_last_two_xmms: ; also entered from _less_than_32 with arg3 = len-16 and xmm7 = first 16 bytes
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3] ; last 16 bytes of the buffer (overlaps bytes already folded)
+ pshufb xmm1, xmm11 ; byte-reflect with SHUF_MASK
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant: 16 control bytes starting at pshufb_shf_table + 16 - arg3
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1] ; mask1 is 0x80 in every byte: flips bit 7 of each control byte, giving the complementary shift
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit: bytes whose control byte has bit 7 set are taken from xmm2
+
+ ; fold 16 Bytes: xmm7 = xmm7.hi64*xmm10.hi64 xor xmm7.lo64*xmm10.lo64 xor xmm1 (xmm10 holds rk1/rk2)
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value held in xmm7
+ movdqa xmm10, [rk5] ; rk5 (low qword) and rk6 (high qword) in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x1 ; xmm7.hi64 * rk5
+ pslldq xmm0, 8 ; move the original low qword into the high qword
+ pxor xmm7, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm7
+
+ pand xmm0, [mask2] ; mask2 keeps the low 96 bits and clears the top dword
+
+ psrldq xmm7, 12 ; bring the top dword down to the bottom
+ pclmulqdq xmm7, xmm10, 0x10 ; xmm7.lo64 * rk6
+ pxor xmm7, xmm0
+
+ ;barrett reduction (also entered directly by the 1-3 byte paths, with xmm7 already shifted)
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 = floor(2^64/Q) (low qword) and rk8 = Q (high qword) in xmm10
+ movdqa xmm0, xmm7
+ pclmulqdq xmm7, xmm10, 0x01 ; xmm7.hi64 * rk7
+ pslldq xmm7, 4
+ pclmulqdq xmm7, xmm10, 0x11 ; xmm7.hi64 * rk8
+
+ pslldq xmm7, 4
+ pxor xmm7, xmm0
+ pextrd eax, xmm7,1 ; result is dword 1 (bits 32..63) of xmm7
+
+_cleanup:
+ ; scale the result back to 16 bits (the zero-length path arrives here with eax = init_crc << 16)
+ shr eax, 16
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp+16*2]
+ movdqa xmm7, [rsp+16*3]
+ movdqa xmm8, [rsp+16*4]
+ movdqa xmm9, [rsp+16*5]
+ movdqa xmm10, [rsp+16*6]
+ movdqa xmm11, [rsp+16*7]
+ movdqa xmm12, [rsp+16*8]
+ movdqa xmm13, [rsp+16*9]
+%endif
+ add rsp, VARIABLE_OFFSET ; release the frame reserved at function entry
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; reached from _less_than_32 with 1 <= arg3 <= 15, xmm0 = init crc in its top dword, xmm11 = SHUF_MASK.
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ pxor xmm1, xmm1
+ mov r11, rsp ; r11 = write cursor into the scratch buffer at [rsp]
+ movdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4 ; 1-3 bytes are handled separately and finish at _barrett
+
+ ; backup the counter value (arg3 is consumed below; r9 selects the shift mask at _zero_left)
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; copy 8 Bytes from the buffer to the scratch area
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; copy 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; copy 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; copy 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp] ; the copied bytes followed by zero padding
+ pshufb xmm7, xmm11 ; byte-reflect
+ pxor xmm7, xmm0 ; xor the initial crc value
+ ; build the pshufb control for a right shift by 16-r9 bytes (same table + mask1 trick as _get_last_two_xmms)
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
+align 16
+; Q = 0x18BB70000
+; rk1 = 2^(32*3) mod Q << 32
+; rk2 = 2^(32*5) mod Q << 32
+; rk3 = 2^(32*31) mod Q << 32
+; rk4 = 2^(32*33) mod Q << 32
+; rk5 = 2^(32*3) mod Q << 32
+; rk6 = 2^(32*2) mod Q << 32
+; rk7 = floor(2^64/Q)
+; rk8 = Q
+rk1:
+DQ 0x2d56000000000000
+rk2:
+DQ 0x06df000000000000
+rk3:
+DQ 0x9d9d000000000000
+rk4:
+DQ 0x7cf5000000000000
+rk5:
+DQ 0x2d56000000000000
+rk6:
+DQ 0x1368000000000000
+rk7:
+DQ 0x00000001f65a57f8
+rk8:
+DQ 0x000000018bb70000
+
+rk9:
+DQ 0xceae000000000000
+rk10:
+DQ 0xbfd6000000000000
+rk11:
+DQ 0x1e16000000000000
+rk12:
+DQ 0x713c000000000000
+rk13:
+DQ 0xf7f9000000000000
+rk14:
+DQ 0x80a6000000000000
+rk15:
+DQ 0x044c000000000000
+rk16:
+DQ 0xe658000000000000
+rk17:
+DQ 0xad18000000000000
+rk18:
+DQ 0xa497000000000000
+rk19:
+DQ 0x6ee3000000000000
+rk20:
+DQ 0xe7b5000000000000
+
+
+
+
+
+
+
+
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+;;; func core, ver, snum
+slversion crc16_t10dif_01, 01, 06, 0010
+
diff --git a/src/isa-l/crc/crc16_t10dif_by4.asm b/src/isa-l/crc/crc16_t10dif_by4.asm
new file mode 100644
index 00000000..f79f4f6d
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_by4.asm
@@ -0,0 +1,562 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Function API:
+; UINT16 crc16_t10dif_by4(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+align 16
+global crc16_t10dif_by4:function
+crc16_t10dif_by4:
+ ; in: arg1_low32 = init_crc, arg2 = buf, arg3 = len (see Function API above); the result is returned in eax
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+ ; After this point, code flow is exactly same as a 32-bit CRC.
+ ; The only difference is before returning eax, we will shift
+ ; it right 16 bits, to scale back to 16 bits.
+
+ sub rsp,16*4+8 ; frame: [rsp] = 16B scratch used by _less_than_16_left, [rsp+16*2..] = xmm6/xmm7 save slots
+ ; NOTE(review): movdqa needs 16B-aligned addresses; the +8 presumably compensates for the return address - confirm
+ ; save xmm6 and xmm7, which this routine overwrites (done for every output format, not only win64)
+ movdqa [rsp+16*2],xmm6
+ movdqa [rsp+16*3],xmm7
+
+ ; check if smaller than 128B
+ cmp arg3, 128
+
+ ; for sizes less than 128, we can't fold 64B at a time...
+ jl _less_than_128 ; jl is a signed comparison of len against 128
+
+
+ ; load the initial crc value
+ movd xmm6, arg1_low32 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to
+ ; be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with
+ ; initial crc at correct place.
+ pslldq xmm6, 12
+
+ movdqa xmm7, [SHUF_MASK]
+ ; receive the initial 64B data, xor the initial crc value
+ movdqu xmm0, [arg2]
+ movdqu xmm1, [arg2+16]
+ movdqu xmm2, [arg2+32]
+ movdqu xmm3, [arg2+48]
+
+ pshufb xmm0, xmm7
+ ; XOR the initial_crc value
+ pxor xmm0, xmm6
+ pshufb xmm1, xmm7
+ pshufb xmm2, xmm7
+ pshufb xmm3, xmm7
+
+ movdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
+ ;imm value of pclmulqdq instruction
+ ;will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 128 instead of 64 to save one instruction from the loop
+ sub arg3, 128
+
+ ; at this section of the code, there is 64*x+y (0<=y<64) bytes of
+ ; buffer. The _fold_64_B_loop
+ ; loop will fold 64B at a time until we have 64+y Bytes of buffer
+
+
+ ; fold 64B at a time. This section of the code folds 4 xmm
+ ; registers in parallel
+_fold_64_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 64 ; buf += 64;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm4, xmm0
+ movdqu xmm5, xmm1
+
+ pclmulqdq xmm0, xmm6 , 0x11
+ pclmulqdq xmm1, xmm6 , 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm4, xmm2
+ movdqu xmm5, xmm3
+
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ movdqu xmm4, [arg2]
+ movdqu xmm5, [arg2+16]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ movdqu xmm4, [arg2+32]
+ movdqu xmm5, [arg2+48]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ sub arg3, 64
+
+ ; check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 64
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
+
+
+ ; fold the 4 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will
+ ;determine which constant to use
+
+ movdqa xmm4, xmm0
+ pclmulqdq xmm0, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm1, xmm4
+ pxor xmm1, xmm0
+
+ movdqa xmm4, xmm1
+ pclmulqdq xmm1, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm2, xmm4
+ pxor xmm2, xmm1
+
+ movdqa xmm4, xmm2
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+
+ ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 64-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes
+ ; is in register xmm3 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm7
+ pxor xmm3, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm3 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer,
+ ; we can offset the input pointer before the actual point,
+ ; to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm3
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm7
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm3 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value held in xmm3
+ movdqa xmm6, [rk5] ; rk5 (low qword) and rk6 (high qword) in xmm6
+ movdqa xmm0, xmm3
+
+ ;64b fold
+ pclmulqdq xmm3, xmm6, 0x1 ; xmm3.hi64 * rk5
+ pslldq xmm0, 8 ; move the original low qword into the high qword
+ pxor xmm3, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm3
+
+ pand xmm0, [mask2] ; mask2 keeps the low 96 bits and clears the top dword
+
+ psrldq xmm3, 12 ; bring the top dword down to the bottom
+ pclmulqdq xmm3, xmm6, 0x10 ; xmm3.lo64 * rk6
+ pxor xmm3, xmm0
+
+ ;barrett reduction (also entered directly by the 1-3 byte paths, with xmm3 already shifted)
+_barrett:
+ movdqa xmm6, [rk7] ; rk7 = floor(2^64/Q) (low qword) and rk8 = Q (high qword) in xmm6
+ movdqa xmm0, xmm3
+ pclmulqdq xmm3, xmm6, 0x01 ; xmm3.hi64 * rk7
+ pslldq xmm3, 4
+ pclmulqdq xmm3, xmm6, 0x11 ; xmm3.hi64 * rk8
+
+ pslldq xmm3, 4
+ pxor xmm3, xmm0
+ pextrd eax, xmm3,1 ; result is dword 1 (bits 32..63) of xmm3
+
+_cleanup:
+ ; scale the result back to 16 bits (the zero-length path arrives here with eax = init_crc << 16)
+ shr eax, 16
+ movdqa xmm6, [rsp+16*2] ; restore the registers saved at function entry
+ movdqa xmm7, [rsp+16*3]
+ add rsp,16*4+8 ; release the frame reserved at function entry
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_128:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm7, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+ movdqu xmm3, [arg2] ; load the plaintext
+ pshufb xmm3, xmm7 ; byte-reflect the plaintext
+ pxor xmm3, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm7, [SHUF_MASK]
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm3, [arg2] ; load the plaintext
+ pshufb xmm3, xmm7 ; byte-reflect the plaintext
+ pxor xmm3, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ ; pick the pshufb control bytes for the saved length (r9) out of pshufb_shf_table
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm3, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
+align 16
+; Q = 0x18BB70000
+; rk1 = 2^(32*3) mod Q << 32
+; rk2 = 2^(32*5) mod Q << 32
+; rk3 = 2^(32*15) mod Q << 32
+; rk4 = 2^(32*17) mod Q << 32
+; rk5 = 2^(32*3) mod Q << 32
+; rk6 = 2^(32*2) mod Q << 32
+; rk7 = floor(2^64/Q)
+; rk8 = Q
+rk1:
+DQ 0x2d56000000000000
+rk2:
+DQ 0x06df000000000000
+rk3:
+DQ 0x044c000000000000
+rk4:
+DQ 0xe658000000000000
+rk5:
+DQ 0x2d56000000000000
+rk6:
+DQ 0x1368000000000000
+rk7:
+DQ 0x00000001f65a57f8
+rk8:
+DQ 0x000000018bb70000
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+;;; func core, ver, snum
+slversion crc16_t10dif_by4, 05, 02, 0016
diff --git a/src/isa-l/crc/crc16_t10dif_perf.c b/src/isa-l/crc/crc16_t10dif_perf.c
new file mode 100644
index 00000000..34f1ddbd
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_perf.c
@@ -0,0 +1,87 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+//#define CACHED_TEST	// uncomment to time the small, repeatedly reused buffer instead
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset (size macros are parenthesized so they expand safely)
+# define TEST_LEN (8*1024)
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base (size macros are parenthesized so they expand safely)
+# define GT_L3_CACHE (32*1024*1024) /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+// Times TEST_LOOPS calls of crc16_t10dif() over a zero-filled TEST_LEN-byte buffer.
+int main(int argc, char *argv[])
+{
+	void *data;
+	uint16_t result;
+	struct perf t_begin, t_end;
+	int pass = 0;
+
+	printf("crc16_t10dif_perf:\n");
+	if (posix_memalign(&data, 1024, TEST_LEN) != 0) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+
+	printf("Start timed tests\n");
+	fflush(NULL);
+
+	memset(data, 0, TEST_LEN);
+	result = crc16_t10dif(TEST_SEED, data, TEST_LEN);	// one call outside the timed region
+	perf_start(&t_begin);
+	while (pass < TEST_LOOPS) {
+		result = crc16_t10dif(TEST_SEED, data, TEST_LEN);
+		pass++;
+	}
+	perf_stop(&t_end);
+	printf("crc16_t10dif" TEST_TYPE_STR ": ");
+	perf_print(t_end, t_begin, (long long)TEST_LEN * pass);	// TEST_LEN bytes per timed call
+	printf("finish 0x%x\n", result);
+	return 0;
+}
diff --git a/src/isa-l/crc/crc16_t10dif_test.c b/src/isa-l/crc/crc16_t10dif_test.c
new file mode 100644
index 00000000..e622f10a
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_test.c
@@ -0,0 +1,167 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "crc.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define MAX_BUF 512
+#define TEST_SIZE 20
+
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	// Fill buf front to back with one rand() result per byte (narrowed to unsigned char).
+	for (long left = buffer_size; left > 0; left--)
+		*buf++ = rand();
+}
+
+// Self-test: compares crc16_t10dif() with crc16_t10dif_base() on zero, constant
+// and pseudo-random buffers, on every length from MAX_BUF down to 0, with 20
+// different seeds, and on the last TEST_SIZE bytes of the allocation.  Passing
+// any command-line argument enables verbose output.  Returns 0 when all checks
+// pass, 1 when at least one failed, and -1 if the test buffer cannot be allocated.
+int main(int argc, char *argv[])
+{
+	int fail = 0;
+	u32 r = 0;
+	int verbose = argc - 1;
+	int i, s;
+	void *buf_raw;
+	unsigned char *buf;
+
+	printf("Test crc16_t10dif_test ");
+	if (posix_memalign(&buf_raw, MAX_BUF, MAX_BUF * TEST_SIZE)) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+	buf = (unsigned char *)buf_raw;
+
+	srand(TEST_SEED);
+
+	// Test of all zeros
+	memset(buf, 0, MAX_BUF * 10);
+	u16 crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
+	u16 crc_ref = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
+	if (crc != crc_ref) {
+		fail++;
+		printf("\n opt ref\n");
+		printf(" ------ ------\n");
+		printf("crc zero = 0x%4x 0x%4x \n", crc, crc_ref);
+	} else
+		printf(".");
+
+	// Another simple test pattern
+	memset(buf, 0x8a, MAX_BUF);
+	crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
+	crc_ref = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
+	if (crc != crc_ref) {
+		fail++;
+		printf("crc all 8a = 0x%4x 0x%4x\n", crc, crc_ref);
+	} else
+		printf(".");
+
+	// Do a few random tests: TEST_SIZE consecutive MAX_BUF-byte slices of random data
+	rand_buffer(buf, MAX_BUF * TEST_SIZE);
+	for (i = 0; i < TEST_SIZE; i++) {
+		crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
+		crc_ref = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
+		if (crc != crc_ref)
+			fail++;
+		if (verbose)
+			printf("crc rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
+		else
+			printf(".");
+		buf += MAX_BUF;
+	}
+
+	// Do a few random sizes
+	buf = (unsigned char *)buf_raw;	//reset buf
+	r = rand();
+
+	for (i = MAX_BUF; i >= 0; i--) {
+		crc = crc16_t10dif(r, buf, i);
+		crc_ref = crc16_t10dif_base(r, buf, i);
+		if (crc != crc_ref) {
+			fail++;
+			printf("fail random size%i 0x%8x 0x%8x\n", i, crc, crc_ref);
+		} else
+			printf(".");
+	}
+
+	// Try different seeds
+	for (s = 0; s < 20; s++) {
+		buf = (unsigned char *)buf_raw;	//reset buf
+		r = rand();	// just to get a new seed
+		rand_buffer(buf, MAX_BUF * TEST_SIZE);	// new pseudo-rand data
+		if (verbose)
+			printf("seed = 0x%x\n", r);
+		for (i = 0; i < TEST_SIZE; i++) {
+			crc = crc16_t10dif(r, buf, MAX_BUF);
+			crc_ref = crc16_t10dif_base(r, buf, MAX_BUF);
+			if (crc != crc_ref)
+				fail++;
+			if (verbose)
+				printf("crc rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
+			else
+				printf(".");
+			buf += MAX_BUF;
+		}
+	}
+
+	// Run tests at end of buffer
+	buf = (unsigned char *)buf_raw;	//reset buf
+	buf = buf + ((MAX_BUF - 1) * TEST_SIZE);	//Line up TEST_SIZE from end
+	for (i = 0; i < TEST_SIZE; i++) {
+		crc = crc16_t10dif(TEST_SEED, buf + i, TEST_SIZE - i);
+		crc_ref = crc16_t10dif_base(TEST_SEED, buf + i, TEST_SIZE - i);
+		if (crc != crc_ref)
+			fail++;
+		if (verbose)
+			printf("crc eob rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
+		else
+			printf(".");
+	}
+
+	printf("Test done: %s\n", fail ? "Fail" : "Pass");
+	if (fail)
+		printf("\nFailed %d tests\n", fail);
+	free(buf_raw);
+	return fail ? 1 : 0;	// an exit status keeps only 8 bits, so a raw count of 256 would read as success
+}
diff --git a/src/isa-l/crc/crc32_ieee_01.asm b/src/isa-l/crc/crc32_ieee_01.asm
new file mode 100644
index 00000000..3c463dad
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_01.asm
@@ -0,0 +1,655 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; UINT32 crc32_ieee_01(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+[bits 64]
+default rel
+
+section .text
+; arg1 = init_crc, arg2 = buf, arg3 = len (see Function API above); registers depend on the output format
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ ; 32-bit view of arg1
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ ; 32-bit view of arg1
+ %xdefine arg1_low32 edi
+%endif
+; stack frame: VARIABLE_OFFSET bytes are reserved; win64 keeps xmm6-xmm13 at rsp+XMM_SAVE. NOTE(review): the +8 presumably keeps rsp 16-byte aligned for movdqa - confirm
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+align 16
+global crc32_ieee_01:function
+crc32_ieee_01:
+ ; the crc is handled in complemented form: inverted here and inverted back in _cleanup
+ not arg1_low32 ;~init_crc
+ ; reserve the stack frame (16-byte scratch block at [rsp]; xmm save area on win64)
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; save xmm6-xmm13 on the stack; _cleanup restores them before returning
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+ ; NOTE: jl is a signed comparison, so a len with bit 63 set also takes the short-buffer path
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movd xmm10, arg1_low32 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial crc at correct place.
+ pslldq xmm10, 12
+
+ movdqa xmm11, [SHUF_MASK] ; pshufb control that reverses the order of the 16 bytes
+ ; load the first 128 bytes of the buffer into xmm0-xmm7
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+ ; per iteration, for N = 0..7: xmmN = clmul(xmmN.low64, rk3) ^ clmul(xmmN.high64, rk4) ^ next 16 input bytes (byte-reversed)
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0] ; prefetch hint fetch_dist (1024) bytes ahead of the current block
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0 ; imm 0x00: low qword * low qword (rk3)
+ pclmulqdq xmm8, xmm10 , 0x11 ; imm 0x11: high qword * high qword (rk4)
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8 ; xorps performs the same bitwise XOR as pxor
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; the 128B of folded data is in 8 xmm registers: xmm0 - xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+ ; xmm0..xmm6 are each multiplied by their own constant pair and accumulated into xmm7
+ movdqa xmm10, [rk9] ; xmm10 = rk9 (low qword), rk10 (high qword)
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+ ; reached by fall-through and by jmp from _less_than_256; both set xmm10 = rk1/rk2 and xmm11 = SHUF_MASK first
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11 ; high qword * rk2
+ pclmulqdq xmm8, xmm10, 0x0 ; low qword * rk1
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2] ; next 16 input bytes
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16 ; the flags from this sub drive the jge below
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16 ; arg3 becomes the number of bytes still unread (0..15)
+ je _128_done
+ ; from here on arg3 is 1..15 (also when entered from _less_than_32 with arg3 = len - 16)
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+ ; arg2 + arg3 is the end of the buffer, so this loads its last 16 bytes (overlapping bytes already folded)
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax] ; unaligned 16-byte window at table offset 16 - arg3
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1] ; flip bit 7 of every selector byte (pshufb zeroes a byte whose selector has bit 7 set)
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x1 ; imm 0x01: xmm7 high qword * rk5 (xmm10 low qword)
+ pslldq xmm0, 8 ; original low qword moved into the high half
+ pxor xmm7, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm7
+
+ pand xmm0, [mask2] ; keep bits 0..95, clear the top 32 bits
+
+ psrldq xmm7, 12 ; top 32 bits moved down to bits 0..31
+ pclmulqdq xmm7, xmm10, 0x10 ; imm 0x10: xmm7 low qword * rk6 (xmm10 high qword)
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7
+ pclmulqdq xmm7, xmm10, 0x01 ; xmm7 high qword * rk7
+ pslldq xmm7, 4
+ pclmulqdq xmm7, xmm10, 0x11 ; xmm7 high qword * rk8
+
+ pslldq xmm7, 4
+ pxor xmm7, xmm0
+ pextrd eax, xmm7,1 ; result = bits 32..63 of xmm7
+
+_cleanup:
+ not eax ; undo the inversion applied at entry; the return value is in eax
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp,VARIABLE_OFFSET ; release the frame reserved at entry
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+ ; reached from the entry check when len < 256 (signed compare)
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32 ; already inverted at entry; _cleanup inverts it back
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16 ; one compare feeds both branches below
+ je _exact_16_left
+ jl _less_than_16_left
+ ; 16 < len < 32: fold the first 16 bytes, then handle the 1..15 byte tail
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ ; the input is copied to [rsp] in 8/4/2/1-byte pieces, so exactly arg3 bytes are read from the buffer
+ pxor xmm1, xmm1
+ mov r11, rsp ; r11 = write cursor into the scratch block
+ movdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3 ; r9 = original length (4..15); arg3 is consumed below
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp] ; the zero-padded copy of the input
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; the table is indexed in bytes, so r9 is used unscaled (no "shl r9, 4")
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ movdqu xmm0, [rax] ; unaligned 16-byte window at table offset 16 - r9
+ pxor xmm0, [mask1] ; flip bit 7 of every selector byte
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2] ; len == 16: the whole buffer fits in one register
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3 ; len is 1..3 here; r11 = rsp and [rsp] was zeroed in _less_than_16_left
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 5 ; 3 bytes: shift right by 5 bytes
+ ; goes straight to the Barrett step; the 64b/32b folds of _128_done are skipped
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 6 ; 2 bytes: shift right by 6 bytes
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 7 ; 1 byte: shift right by 7 bytes
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants, loaded in pairs: movdqa from [rkN] puts rkN in the low qword and rkN+1 in the high qword
+align 16
+; the code loads rk3/rk4 for the 128B loop, rk9..rk19 and rk1 for the 8-to-1 fold, rk1 for 16B folds, rk5 in _128_done, rk7 in _barrett
+rk1 :
+DQ 0xf200aa6600000000
+rk2 :
+DQ 0x17d3315d00000000
+rk3 :
+DQ 0x022ffca500000000
+rk4 :
+DQ 0x9d9ee22f00000000
+rk5 :
+DQ 0xf200aa6600000000 ; same value as rk1
+rk6 :
+DQ 0x490d678d00000000
+rk7 :
+DQ 0x0000000104d101df
+rk8 :
+DQ 0x0000000104c11db7 ; NOTE(review): presumably the CRC-32 IEEE generator polynomial with its leading 1 bit - confirm
+rk9 :
+DQ 0x6ac7e7d700000000
+rk10 :
+DQ 0xfcd922af00000000
+rk11 :
+DQ 0x34e45a6300000000
+rk12 :
+DQ 0x8762c1f600000000
+rk13 :
+DQ 0x5395a0ea00000000
+rk14 :
+DQ 0x54f2d5c700000000
+rk15 :
+DQ 0xd3504ec700000000
+rk16 :
+DQ 0x57a8445500000000
+rk17 :
+DQ 0xc053585d00000000
+rk18 :
+DQ 0x766f1b7800000000
+rk19 :
+DQ 0xcd8c54b500000000
+rk20 :
+DQ 0xab40b71e00000000
+
+
+
+
+
+
+
+
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080 ; bit 7 of every byte; XORed into pshufb selectors to swap kept and zeroed bytes
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF ; keeps bits 0..95, clears the top 32 bits
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607 ; pshufb control: result byte i = source byte 15-i
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; a 16-byte window starting k bytes (k = 1..15) into the 32 data bytes below yields row k:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; bytes 0..15; the code reads a window at offset 16 - n
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; bytes 16..31
+
+;;; func core, ver, snum
+slversion crc32_ieee_01, 01, 06, 0011
+
diff --git a/src/isa-l/crc/crc32_ieee_by4.asm b/src/isa-l/crc/crc32_ieee_by4.asm
new file mode 100644
index 00000000..97f68680
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_by4.asm
@@ -0,0 +1,565 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Function API:
+; UINT32 crc32_ieee_by4(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+; arg1 = init_crc, arg2 = buf, arg3 = len (see Function API above); registers depend on the output format
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ ; 32-bit view of arg1
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ ; 32-bit view of arg1
+ %xdefine arg1_low32 edi
+%endif
+; stack frame: VARIABLE_OFFSET bytes are reserved; win64 keeps xmm6 and xmm7 at rsp+XMM_SAVE
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*4+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+global crc32_ieee_by4:function
+crc32_ieee_by4:
+
+ not arg1_low32
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ movdqa [rsp + XMM_SAVE + 16*0],xmm6
+ movdqa [rsp + XMM_SAVE + 16*1],xmm7
+%endif
+
+ ; check if smaller than 128B
+ cmp arg3, 128
+ jl _less_than_128
+
+
+
+ ; load the initial crc value
+ movd xmm6, arg1_low32 ; initial crc
+ ; crc value does not need to be byte-reflected, but it needs to be
+ ; moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial
+ ; crc at correct place.
+ pslldq xmm6, 12
+
+
+
+ movdqa xmm7, [SHUF_MASK]
+ ; receive the initial 64B data, xor the initial crc value
+ movdqu xmm0, [arg2]
+ movdqu xmm1, [arg2+16]
+ movdqu xmm2, [arg2+32]
+ movdqu xmm3, [arg2+48]
+
+
+
+ pshufb xmm0, xmm7
+ ; XOR the initial_crc value
+ pxor xmm0, xmm6
+ pshufb xmm1, xmm7
+ pshufb xmm2, xmm7
+ pshufb xmm3, xmm7
+
+ movdqa xmm6, [rk3] ; k3=2^480 mod POLY << 32
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;we subtract 128 instead of 64 to save one instruction from the loop
+ sub arg3, 128
+
+ ; at this section of the code, there is 64*x+y (0<=y<64) bytes of
+ ; buffer. The _fold_64_B_loop loop will fold 64B at a time until we
+ ; have 64+y Bytes of buffer
+
+
+ ; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
+_fold_64_B_loop:
+
+ ;update the buffer pointer
+ add arg2, 64
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+
+ pclmulqdq xmm0, xmm6 , 0x11
+ pclmulqdq xmm1, xmm6 , 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ movdqu xmm4, [arg2]
+ movdqu xmm5, [arg2+16]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ movdqu xmm4, [arg2+32]
+ movdqu xmm5, [arg2+48]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ sub arg3, 64
+
+ ; check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 64
+ ;at this point, the arg2 is pointing at the last y Bytes of the buffer
+ ; the 64B of data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
+
+
+ movdqa xmm6, [rk1] ;k1
+
+ ; fold the 4 xmm registers to 1 xmm register with different constants
+ movdqa xmm4, xmm0
+ pclmulqdq xmm0, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm1, xmm4
+ xorps xmm1, xmm0
+
+ movdqa xmm4, xmm1
+ pclmulqdq xmm1, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm2, xmm4
+ xorps xmm2, xmm1
+
+ movdqa xmm4, xmm2
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+
+ ;instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 64-16
+ jl _final_reduction_for_128
+
+; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm3 and the rest is in memory
+; we can fold 16 bytes at a time if y>=16
+; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm7
+ pxor xmm3, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm3 register
+
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset
+ ; the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm3
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm7
+
+ shl arg3, 4
+ lea rax, [pshufb_shf_table + 15*16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ pshufb xmm2, xmm0
+
+ pxor xmm0, [mask3]
+
+ pshufb xmm3, xmm0
+
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ movdqa xmm2, xmm1
+
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+_128_done:
+
+ movdqa xmm6, [rk5]
+ movdqa xmm0, xmm3
+
+ ;64b fold
+ pclmulqdq xmm3, xmm6, 0x1
+ pslldq xmm0, 8
+ pxor xmm3, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm3
+
+ pand xmm0, [mask4]
+
+ psrldq xmm3, 12
+ pclmulqdq xmm3, xmm6, 0x10
+ pxor xmm3, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm6, [rk7]
+ movdqa xmm0, xmm3
+ pclmulqdq xmm3, xmm6, 0x01
+ pslldq xmm3, 4
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pslldq xmm3, 4
+ pxor xmm3, xmm0
+ pextrd eax, xmm3,1
+
+_cleanup:
+ not eax
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+%endif
+ add rsp,VARIABLE_OFFSET
+
+
+ ret
+
+
+
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_128:
+
+ ;check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm7, [SHUF_MASK]
+
+ ;if there is, load the constants
+ movdqa xmm6, [rk1] ;k1
+
+ movd xmm0, arg1_low32
+ pslldq xmm0, 12
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+
+ ;update the buffer pointer
+ add arg2, 16
+
+ ;update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm7, [SHUF_MASK]
+
+ movd xmm0, arg1_low32
+ pslldq xmm0, 12
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+ movd xmm0, arg1_low32
+ pslldq xmm0, 12
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm6, [rk1] ;k1
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ mov r9, arg3
+
+
+ cmp arg3, 8
+ jl _less_than_8_left
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ shl r9, 4
+ lea rax, [pshufb_shf_table + 15*16]
+ sub rax, r9
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask3]
+
+ pshufb xmm3, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ psrldq xmm3, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ psrldq xmm3, 6
+
+ jmp _barrett
+_only_less_than_2:
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ psrldq xmm3, 7
+
+ jmp _barrett
+; precomputed constants. The code fetches these with movdqa, i.e. 16 aligned bytes at a time, so every pair loaded together (rk1:rk2, rk5:rk6, rk7:rk8) has to stay adjacent and 16-byte aligned.
+section .data
+
+align 16
+rk1: ; rk1:rk2 -> xmm6, the multiplier pair used while folding 16 bytes at a time
+DQ 0xf200aa6600000000
+rk2:
+DQ 0x17d3315d00000000
+rk3:
+DQ 0xd3504ec700000000
+rk4:
+DQ 0x57a8445500000000
+rk5: ; rk5:rk6 -> xmm6 for the 64b and 32b folds at _128_done
+DQ 0xf200aa6600000000
+rk6:
+DQ 0x490d678d00000000
+rk7: ; rk7:rk8 -> xmm6 for the reduction at _barrett
+DQ 0x0000000104d101df
+rk8:
+DQ 0x0000000104c11db7
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3: ; XORed into a pshufb_shf_table entry: flips bit 7 of every index byte
+dq 0x8080808080808080, 0x8080808080808080
+mask4: ; ANDed in the 32b fold to clear bits 96..127
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+ align 32
+pshufb_shf_table:
+; 15 pshufb masks of 16 bytes each; for a tail of n bytes (1..15) the code reads the entry at pshufb_shf_table + 15*16 - 16*n. Index bytes with bit 7 set make pshufb write zero.
+ dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+
+ dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+
+ dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+
+ dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+
+ dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+
+ dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+
+ dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+
+ dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+
+ dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+
+ dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+
+ dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+
+ dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+
+ dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+
+ dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+
+ dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+
+
+SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607 ; pshufb mask (kept in xmm7) that reverses the order of the 16 bytes
+
+;;; func core, ver, snum
+slversion crc32_ieee_by4, 05, 02, 0017
diff --git a/src/isa-l/crc/crc32_ieee_perf.c b/src/isa-l/crc/crc32_ieee_perf.c
new file mode 100644
index 00000000..c6c74950
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_perf.c
@@ -0,0 +1,87 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+/* Benchmark driver: times TEST_LOOPS crc32_ieee() passes over one zeroed TEST_LEN-byte buffer. */
+int main(int argc, char *argv[])
+{
+	int i;
+	void *buf;
+	uint32_t crc;
+	struct perf start, stop;
+
+	printf("crc32_ieee_perf:\n");
+
+	if (posix_memalign(&buf, 1024, TEST_LEN)) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+
+	printf("Start timed tests\n");
+	fflush(NULL);	// flush all open output streams before the timed section
+
+	memset(buf, 0, TEST_LEN);
+	crc = crc32_ieee(TEST_SEED, buf, TEST_LEN);	// one untimed pass before the clock starts
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++) {
+		crc = crc32_ieee(TEST_SEED, buf, TEST_LEN);
+	}
+	perf_stop(&stop);
+	printf("crc32_ieee" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);	// i == TEST_LOOPS here
+	free(buf);	// the buffer is not used past this point; release it instead of leaking it
+	printf("finish 0x%x\n", crc);
+	return 0;
+}
diff --git a/src/isa-l/crc/crc32_ieee_test.c b/src/isa-l/crc/crc32_ieee_test.c
new file mode 100644
index 00000000..4dfbc005
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_test.c
@@ -0,0 +1,174 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "crc.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define MAX_BUF 512
+#define TEST_SIZE 20
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+// Fill buf[0 .. buffer_size) with bytes taken from successive rand() calls
+// (each value is truncated to unsigned char); a count <= 0 writes nothing.
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	long remaining;
+	for (remaining = buffer_size; remaining > 0; remaining--)
+		*buf++ = rand();
+}
+// Self-test: checks crc32_ieee() against crc32_ieee_base(); exit status is 0 on pass, 1 on any mismatch.
+int main(int argc, char *argv[])
+{
+	int fail = 0;
+	u32 r;
+	int verbose = argc - 1;	// any extra command-line argument turns on per-test output
+	int i, s, ret;
+	void *buf_alloc;
+	unsigned char *buf;
+
+	printf("Test crc32_ieee ");
+
+	// Align to MAX_BUF boundary
+	ret = posix_memalign(&buf_alloc, MAX_BUF, MAX_BUF * TEST_SIZE);
+	if (ret) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+	buf = (unsigned char *)buf_alloc;
+
+	srand(TEST_SEED);
+
+	// Test of all zeros
+	memset(buf, 0, MAX_BUF * 10);
+	u32 crc = crc32_ieee(TEST_SEED, buf, MAX_BUF);
+	u32 crc_ref = crc32_ieee_base(TEST_SEED, buf, MAX_BUF);
+	if (crc != crc_ref) {
+		fail++;
+		printf("\n opt ref\n");
+		printf(" ------ ------\n");
+		printf("crc zero = 0x%8x 0x%8x \n", crc, crc_ref);
+	} else
+		printf(".");
+
+	// Another simple test pattern
+	memset(buf, 0x8a, MAX_BUF);
+	crc = crc32_ieee(TEST_SEED, buf, MAX_BUF);
+	crc_ref = crc32_ieee_base(TEST_SEED, buf, MAX_BUF);
+	if (crc != crc_ref)
+		fail++;
+	if (verbose)
+		printf("crc all 8a = 0x%8x 0x%8x\n", crc, crc_ref);
+	else
+		printf(".");
+
+	// Do a few random tests: TEST_SIZE consecutive MAX_BUF-byte slices, one seed
+	r = rand();
+	rand_buffer(buf, MAX_BUF * TEST_SIZE);
+
+	for (i = 0; i < TEST_SIZE; i++) {
+		crc = crc32_ieee(r, buf, MAX_BUF);
+		crc_ref = crc32_ieee_base(r, buf, MAX_BUF);
+		if (crc != crc_ref)
+			fail++;
+		if (verbose)
+			printf("crc rand%3d = 0x%8x 0x%8x\n", i, crc, crc_ref);
+		else
+			printf(".");
+		buf += MAX_BUF;
+	}
+
+	// Every length from MAX_BUF down to 0 on the same data
+	buf = (unsigned char *)buf_alloc;	//reset buf
+	r = rand();
+
+	for (i = MAX_BUF; i >= 0; i--) {
+		crc = crc32_ieee(r, buf, i);
+		crc_ref = crc32_ieee_base(r, buf, i);
+		if (crc != crc_ref) {
+			fail++;
+			printf("fail random size%i 0x%8x 0x%8x\n", i, crc, crc_ref);
+		} else
+			printf(".");
+	}
+
+	// Try different seeds
+	for (s = 0; s < 20; s++) {
+		buf = (unsigned char *)buf_alloc;	//reset buf
+
+		r = rand();	// just to get a new seed
+		rand_buffer(buf, MAX_BUF * TEST_SIZE);	// new pseudo-rand data
+
+		if (verbose)
+			printf("seed = 0x%x\n", r);
+
+		for (i = 0; i < TEST_SIZE; i++) {
+			crc = crc32_ieee(r, buf, MAX_BUF);
+			crc_ref = crc32_ieee_base(r, buf, MAX_BUF);
+			if (crc != crc_ref)
+				fail++;
+			if (verbose)
+				printf("crc rand%3d = 0x%8x 0x%8x\n", i, crc, crc_ref);
+			else
+				printf(".");
+			buf += MAX_BUF;
+		}
+	}
+
+	// Run tests at end of buffer: buf + i + (TEST_SIZE - i) is the last allocated byte + 1
+	buf = (unsigned char *)buf_alloc;	//reset buf
+	buf = buf + ((MAX_BUF - 1) * TEST_SIZE);	//Line up TEST_SIZE from end
+	for (i = 0; i < TEST_SIZE; i++) {
+		crc = crc32_ieee(TEST_SEED, buf + i, TEST_SIZE - i);
+		crc_ref = crc32_ieee_base(TEST_SEED, buf + i, TEST_SIZE - i);
+		if (crc != crc_ref)
+			fail++;
+		if (verbose)
+			printf("crc eob rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
+		else
+			printf(".");
+	}
+	free(buf_alloc);	// buf only aliases this allocation, so one free releases everything
+	printf("Test done: %s\n", fail ? "Fail" : "Pass");
+	if (fail)
+		printf("\nFailed %d tests\n", fail);
+
+	return fail != 0;	// a raw count can wrap to 0 in the 8-bit exit status (e.g. 256 failures)
+}
diff --git a/src/isa-l/crc/crc32_iscsi_00.asm b/src/isa-l/crc/crc32_iscsi_00.asm
new file mode 100644
index 00000000..2833a8d0
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_00.asm
@@ -0,0 +1,656 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function to compute iscsi CRC32 with table-based recombination
+; crc done "by 3" with block sizes 1920, 960, 480, 240
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+default rel
+; crcB3 bSize, td2, td1: crc32 three adjacent bSize-byte blocks at bufptmp side by side (crc0 in rax, crc1 in rbx, crc2 in r10) and merge the three results into rax
+%macro crcB3 3
+%define %%bSize %1 ; 1/3 of buffer size
+%define %%td2 %2 ; table offset for crc0 (2/3 of buffer)
+%define %%td1 %3 ; table offset for crc1 (1/3 of buffer)
+ ;; clobbers rbx, r9, r10, r11 and bufp; expects crc_init to hold the table base address and rax the running crc
+%IF %%bSize=640
+ sub len, %%bSize*3 ;; in this variant len stays biased by -3*bSize until crcB3_end adds it back
+ js %%crcB3_end ;; jump to next level if 3*blockSize > len
+%ELSE
+ cmp len, %%bSize*3
+ jnae %%crcB3_end ;; jump to next level if 3*blockSize > len
+%ENDIF
+ ;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;;
+%%crcB3_loop:
+ ;; rax = crc0 = initial crc
+ xor rbx, rbx ;; rbx = crc1 = 0;
+ xor r10, r10 ;; r10 = crc2 = 0;
+ ;; fully unrolled: bSize/8 - 1 qword steps over all three blocks
+ %assign i 0
+ %rep %%bSize/8 - 1
+ %if i < %%bSize*3/4
+ prefetchnta [bufptmp+ %%bSize*3 +i*4] ;; touches the following 3*bSize bytes, one prefetch every 32 bytes
+ %endif
+ crc32 rax, qword [bufptmp+i + 0*%%bSize] ;; update crc0
+ crc32 rbx, qword [bufptmp+i + 1*%%bSize] ;; update crc1
+ crc32 r10, qword [bufptmp+i + 2*%%bSize] ;; update crc2
+ %assign i (i+8)
+ %endrep
+ crc32 rax, qword [bufptmp+i + 0*%%bSize] ;; update crc0
+ crc32 rbx, qword [bufptmp+i + 1*%%bSize] ;; update crc1
+; SKIP ;crc32 r10, [bufptmp+i + 2*%%bSize] ;; update crc2
+ ;; the last qword of block 2 is deferred: below it is XORed with the merged crc0/crc1 term and fed through one final crc32
+ ; merge in crc0: each of its 4 bytes indexes table td2; the entry is shifted left by 0/8/16/24 bits and XORed into r9
+ movzx bufp_dw, al
+ mov r9d, [crc_init + bufp*4 + %%td2]
+ movzx bufp_dw, ah
+ shr eax, 16
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ shl r11, 8
+ xor r9, r11
+
+ movzx bufp_dw, al
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ movzx bufp_dw, ah
+ shl r11, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ shl r11, 24
+ xor r9, r11
+
+ ; merge in crc1: same byte-wise lookup, using table td1
+
+ movzx bufp_dw, bl
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ movzx bufp_dw, bh
+ shr ebx, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ shl r11, 8
+ xor r9, r11
+
+ movzx bufp_dw, bl
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ movzx bufp_dw, bh
+ shl r11, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ shl r11, 24
+ xor r9, r11
+ ;; r9 = table terms of crc0 and crc1; fold in the deferred qword of block 2 and finish crc2
+ xor r9, [bufptmp+i + 2*%%bSize]
+ crc32 r10, r9
+ mov rax, r10 ;; merged result becomes the running crc
+
+ add bufptmp, %%bSize*3 ;; move to next block
+ sub len, %%bSize*3
+%IF %%bSize=640
+ jns %%crcB3_loop ;; only this variant loops; the other block sizes have no backward jump
+%ENDIF
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%crcB3_end:
+%IF %%bSize=640
+ add len, %%bSize*3
+%ENDIF
+ je do_return ;; return if remaining data is zero
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ISCSI CRC 32 Implementation with crc32 Instruction
+
+;;; unsigned int crc32_iscsi_00(unsigned char * buffer, int len, unsigned int crc_init);
+;;;
+;;; *buf = rdi when assembled for elf64, rcx otherwise
+;;; len = rsi when assembled for elf64, rdx otherwise
+;;; crc_init = rdx when assembled for elf64, r8 otherwise
+;;; result: the updated crc, left in rax/eax
+
+global crc32_iscsi_00:function
+crc32_iscsi_00:
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define bufp rdi
+%define bufp_dw edi
+%define bufp_w di
+%define bufp_b dil
+%define bufptmp rcx
+%define block_0 rcx
+%define block_1 r8
+%define block_2 r11
+%define len rsi
+%define len_dw esi
+%define len_w si
+%define len_b sil
+%define crc_init rdx
+%define crc_init_dw edx
+%else
+%define bufp rcx
+%define bufp_dw ecx
+%define bufp_w cx
+%define bufp_b cl
+%define bufptmp rdi
+%define block_0 rdi
+%define block_1 rsi
+%define block_2 r11
+%define len rdx
+%define len_dw edx
+%define len_w dx
+%define len_b dl
+%define crc_init r8
+%define crc_init_dw r8d
+%endif
+
+
+ push rdi ;; rdi is bufptmp on non-elf64 builds; it is saved on every target
+ push rbx
+
+ mov rax, crc_init ;; rax = crc_init;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov bufptmp, bufp ;; rdi = *buf
+ neg bufp
+ and bufp, 7 ;; calculate the unalignment
+ ;; amount of the address
+ je proc_block ;; Skip if aligned
+ ;; NOTE(review): the prototype declares len as int, yet the full 64-bit register is compared here and below - assumes the caller leaves the upper 32 bits clear; confirm
+ cmp len, 8
+ jb less_than_8
+
+ ;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;;
+ mov rbx, [bufptmp] ;; load a quadword from the buffer
+ add bufptmp, bufp ;; align buffer pointer for
+ ;; quadword processing
+ sub len, bufp ;; update buffer length
+align_loop:
+ crc32 eax, bl ;; compute crc32 of 1-byte
+ shr rbx, 8 ;; get next byte
+ dec bufp
+ jne align_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+proc_block:
+ cmp len, 240
+ jb bit8
+
+ lea crc_init, [mul_table_72] ;; load table base address
+ ;; the table offsets below step by 0x400 = one table of 256 DD entries, so the mul_table_* blocks must stay contiguous and in order
+ crcB3 640, 0x1000, 0x0c00 ; 640*3 = 1920 (Tables 1280, 640)
+ crcB3 320, 0x0c00, 0x0800 ; 320*3 = 960 (Tables 640, 320)
+ crcB3 160, 0x0800, 0x0400 ; 160*3 = 480 (Tables 320, 160)
+ crcB3 80, 0x0400, 0x0000 ; 80*3 = 240 (Tables 160, 80)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+
+bit8:
+ shl len_b, 1 ;; shift-out MSB (bit-8)
+ jnc bit7 ;; jump to bit-7 if bit-8 == 0
+ %assign i 0
+ %rep 16
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero (ZF is still the one set by the shl; crc32 does not write flags)
+ add bufptmp, 128 ;; buf +=128; (next 128 bytes)
+
+bit7:
+ shl len_b, 1 ;; shift-out MSB (bit-7)
+ jnc bit6 ;; jump to bit-6 if bit-7 == 0
+ %assign i 0
+ %rep 8
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 64 ;; buf +=64; (next 64 bytes)
+bit6:
+ shl len_b, 1 ;; shift-out MSB (bit-6)
+ jnc bit5 ;; jump to bit-5 if bit-6 == 0
+ %assign i 0
+ %rep 4
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 32 ;; buf +=32; (next 32 bytes)
+bit5:
+ shl len_b, 1 ;; shift-out MSB (bit-5)
+ jnc bit4 ;; jump to bit-4 if bit-5 == 0
+ %assign i 0
+ %rep 2
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 16 ;; buf +=16; (next 16 bytes)
+bit4:
+ shl len_b, 1 ;; shift-out MSB (bit-4)
+ jnc bit3 ;; jump to bit-3 if bit-4 == 0
+ crc32 rax, qword [bufptmp] ;; compute crc32 of 8-byte data
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 8 ;; buf +=8; (next 8 bytes)
+bit3:
+ mov rbx, qword [bufptmp] ;; load 8 bytes although at most 7 remain (bufptmp is 8-byte aligned on this path). NOTE(review): this load also executes when len == 0
+ shl len_b, 1 ;; shift-out MSB (bit-3)
+ jnc bit2 ;; jump to bit-2 if bit-3 == 0
+ crc32 eax, ebx ;; compute crc32 of 4-byte data
+ je do_return ;; return if remaining data is zero
+ shr rbx, 32 ;; get next 3 bytes
+bit2:
+ shl len_b, 1 ;; shift-out MSB (bit-2)
+ jnc bit1 ;; jump to bit-1 if bit-2 == 0
+ crc32 eax, bx ;; compute crc32 of 2-byte data
+ je do_return ;; return if remaining data is zero
+ shr rbx, 16 ;; next byte
+bit1:
+ test len_b,len_b
+ je do_return
+ crc32 eax, bl ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return:
+
+ pop rbx
+ pop rdi
+ ret
+
+less_than_8: ;; reached only with an unaligned buffer and len < 8: consume 4, 2, 1 bytes as the bits of len dictate
+ test len,4
+ jz less_than_4
+ crc32 eax, dword[bufptmp]
+ add bufptmp,4
+less_than_4:
+ test len,2
+ jz less_than_2
+ crc32 eax, word[bufptmp]
+ add bufptmp,2
+less_than_2:
+ test len,1
+ jz do_return
+ crc32 rax, byte[bufptmp]
+ pop rbx
+ pop bufptmp ;; pops the slot that do_return restores into rdi; bufptmp is rdi only on non-elf64 builds (rcx on elf64)
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272
+
+section .data
+align 8
+mul_table_72:
+DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba
+DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2
+DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb
+DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3
+DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9
+DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91
+DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788
+DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0
+DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad
+DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5
+DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec
+DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4
+DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de
+DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86
+DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f
+DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7
+DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394
+DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc
+DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5
+DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d
+DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7
+DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf
+DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6
+DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe
+DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183
+DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb
+DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2
+DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a
+DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0
+DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8
+DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1
+DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9
+DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6
+DD 0x68828204,0x51513092,0x1b25e728,0x22f655be
+DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7
+DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff
+DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95
+DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd
+DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4
+DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c
+DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1
+DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9
+DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0
+DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8
+DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82
+DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da
+DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3
+DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b
+DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8
+DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190
+DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989
+DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1
+DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb
+DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3
+DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa
+DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2
+DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df
+DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387
+DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e
+DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6
+DD 0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac
+DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4
+DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed
+DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5
+
+mul_table_152:
+DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118
+DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666
+DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4
+DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a
+DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0
+DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e
+DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c
+DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562
+DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8
+DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96
+DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414
+DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a
+DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710
+DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e
+DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec
+DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92
+DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009
+DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777
+DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5
+DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b
+DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1
+DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f
+DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d
+DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473
+DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9
+DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87
+DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505
+DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b
+DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601
+DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f
+DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd
+DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83
+DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a
+DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444
+DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6
+DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8
+DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2
+DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc
+DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e
+DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740
+DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca
+DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4
+DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636
+DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148
+DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532
+DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c
+DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce
+DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0
+DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b
+DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555
+DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7
+DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9
+DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3
+DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad
+DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f
+DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651
+DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db
+DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5
+DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727
+DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059
+DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423
+DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d
+DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf
+DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1
+
+mul_table_312:
+DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c
+DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972
+DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791
+DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f
+DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57
+DD 0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259
+DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba
+DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4
+DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db
+DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5
+DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736
+DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38
+DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0
+DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe
+DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d
+DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413
+DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032
+DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c
+DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df
+DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1
+DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19
+DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317
+DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4
+DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa
+DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095
+DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b
+DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678
+DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76
+DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe
+DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0
+DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53
+DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d
+DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0
+DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee
+DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d
+DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03
+DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb
+DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5
+DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26
+DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628
+DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347
+DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49
+DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa
+DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4
+DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c
+DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062
+DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81
+DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f
+DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae
+DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0
+DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443
+DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d
+DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985
+DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b
+DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68
+DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766
+DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209
+DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07
+DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4
+DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea
+DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922
+DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c
+DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf
+DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1
+
+mul_table_632:
+DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6
+DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef
+DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655
+DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c
+DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0
+DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9
+DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53
+DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a
+DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b
+DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412
+DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8
+DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291
+DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d
+DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914
+DD 0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae
+DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97
+DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c
+DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115
+DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf
+DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796
+DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a
+DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13
+DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9
+DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90
+DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1
+DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8
+DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352
+DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b
+DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7
+DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee
+DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54
+DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d
+DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3
+DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea
+DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350
+DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69
+DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5
+DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec
+DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56
+DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f
+DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e
+DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117
+DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad
+DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794
+DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428
+DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11
+DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab
+DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92
+DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29
+DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410
+DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa
+DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293
+DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f
+DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916
+DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac
+DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95
+DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4
+DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed
+DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657
+DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e
+DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2
+DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb
+DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51
+DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368
+
+mul_table_1272:
+DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c
+DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3
+DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2
+DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d
+DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31
+DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece
+DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf
+DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530
+DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7
+DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28
+DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529
+DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6
+DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda
+DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25
+DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424
+DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db
+DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b
+DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4
+DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5
+DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a
+DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416
+DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9
+DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8
+DD 0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17
+DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0
+DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f
+DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e
+DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1
+DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd
+DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502
+DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03
+DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc
+DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283
+DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c
+DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d
+DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82
+DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e
+DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671
+DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870
+DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f
+DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668
+DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397
+DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96
+DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869
+DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765
+DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a
+DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b
+DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964
+DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4
+DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b
+DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a
+DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5
+DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9
+DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956
+DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757
+DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8
+DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f
+DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0
+DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1
+DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e
+DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842
+DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd
+DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc
+DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643
+
+;;; func core, ver, snum
+slversion crc32_iscsi_00, 00, 03, 0014
+
diff --git a/src/isa-l/crc/crc32_iscsi_01.asm b/src/isa-l/crc/crc32_iscsi_01.asm
new file mode 100644
index 00000000..5b730f63
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_01.asm
@@ -0,0 +1,572 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+%include "reg_sizes.asm"
+
+default rel
+%define CONCAT(a,b,c) a %+ b %+ c
+
+; Define threshold where buffers are considered "small" and routed to more
+; efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
+; SMALL_SIZE can be no larger than 256.
+%define SMALL_SIZE 200
+
+%if (SMALL_SIZE > 256)
+%error SMALL_ SIZE must be <= 256
+% error ; needed because '%error' actually generates only a warning
+%endif
+
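+;;; unsigned int crc32_iscsi_01(unsigned char * buffer, int len, unsigned int crc_init);
+;;;
+;;; *buf = rcx (rdi when assembled as elf64)
+;;; len = rdx (rsi when assembled as elf64)
+;;; crc_init = r8 (arrives in rdx on elf64 and is copied to r8)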
+
+global crc32_iscsi_01:function
+crc32_iscsi_01:
+
+;; Register aliases. The elf64 branch takes buffer/len/crc_init from
+;; rdi/rsi/rdx; every other output format takes them from rcx/rdx/r8.
+;; Note that bufptmp and block_0 name the SAME register in both branches:
+;; the running buffer pointer doubles as the end pointer of block 0.
+%ifidn __OUTPUT_FORMAT__, elf64
+%define bufp rdi
+%define bufp_dw edi
+%define bufp_w di
+%define bufp_b dil
+%define bufptmp rcx
+%define block_0 rcx
+%define block_1 rdx
+%define block_2 r11
+%define len rsi
+%define len_dw esi
+%define len_w si
+%define len_b sil
+%define crc_init_arg rdx
+%else
+%define bufp rcx
+%define bufp_dw ecx
+%define bufp_w cx
+%define bufp_b cl
+%define bufptmp rdi
+%define block_0 rdi
+%define block_1 rsi
+%define block_2 r11
+%define len rdx
+%define len_dw edx
+%define len_w dx
+%define len_b dl
+%endif
+
+;; tmp keeps the number of bytes still to be processed; crc_init is the
+;; running CRC of stream 0, crc1/crc2 are the CRCs of streams 1 and 2.
+%define tmp rbx
+%define crc_init r8
+%define crc_init_dw r8d
+%define crc1 r9
+%define crc2 r10
+
+ ;; Saved here and popped again (rsi, rdi, rbx) on every return path.
+ push rbx
+ push rdi
+ push rsi
+
+ ;; Move crc_init for Linux to a different reg
+ ;; (must happen before block_1, which is also rdx on elf64, is written)
+%ifidn __OUTPUT_FORMAT__, elf64
+ mov crc_init, crc_init_arg
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov bufptmp, bufp ;; bufptmp = *buf (running pointer)
+ neg bufp
+ and bufp, 7 ;; bufp = (-address) & 7 = number of bytes
+ ;; up to the next 8-byte boundary
+ je proc_block ;; Skip if aligned
+
+ ;; If len is less than 8 and we're unaligned, we need to jump
+ ;; to special code to avoid reading beyond the end of the buffer
+ cmp len, 8
+ jb less_than_8
+
+ ;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;
+ ;; len >= 8 here, so the 8-byte load below stays inside the buffer.
+ mov tmp, [bufptmp] ;; load a quadword from the buffer
+ add bufptmp, bufp ;; align buffer pointer for quadword
+ ;; processing
+ sub len, bufp ;; update buffer length
+align_loop:
+ ;; Consume the bufp (1..7) leading bytes, lowest byte of tmp first.
+ crc32 crc_init_dw, bl ;; compute crc32 of 1-byte
+ shr tmp, 8 ;; get next byte
+ dec bufp
+ jne align_loop
+proc_block:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2) PROCESS BLOCKS: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; compute num of bytes to be processed
+ ;; (len is reused as scratch further down, tmp is the durable copy)
+ mov tmp, len ;; save num bytes in tmp
+
+ cmp len, 128*24
+ jae full_block
+
+continue_block:
+ ;; Also re-entered from section 5 with len reloaded from tmp.
+ cmp len, SMALL_SIZE
+ jb small
+
+ ;; len < 128*24
+ ;; Divide by 24 with a multiply by the 16-bit reciprocal.
+ ;; mul writes edx:eax; the byte count itself survives in tmp.
+ mov rax, 2731 ;; 2731 = ceil(2^16 / 24)
+ mul len_dw
+ shr rax, 16
+
+ ;; eax contains floor(bytes / 24) = num 24-byte chunks to do
+
+ ;; process rax 24-byte chunks (rax <= 127 because len < 128*24)
+
+ ;; compute end address of each block
+ ;; (register names below are the non-elf64 aliases)
+ ;; rdi -> block 0 (base addr + RAX * 8)
+ ;; rsi -> block 1 (base addr + RAX * 16)
+ ;; r11 -> block 2 (base addr + RAX * 24)
+ lea block_0, [bufptmp + rax * 8]
+ lea block_1, [block_0 + rax * 8]
+ lea block_2, [block_1 + rax * 8]
+
+ xor crc1,crc1
+ xor crc2,crc2
+
+ ;; branch into array
+ ;; bufp and len are free to be reused as scratch at this point.
+ ;; Target = jump_table + offset + (crc_array - jump_table)
+ ;;        = crc_array + offset, i.e. label crc_<rax>.
+ lea bufp, [jump_table]
+ movzx len, word [bufp + rax * 2] ;; len is offset from crc_array
+ lea bufp, [bufp + len + crc_array - jump_table]
+ jmp bufp
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2a) PROCESS FULL BLOCKS: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+full_block:
+ ;; At least 128*24 bytes remain: run the whole unrolled array once.
+ ;; block_0 is the same register as bufptmp, so on entry it still holds
+ ;; the current buffer position; the three end pointers become
+ ;; base+1024, base+2048 and base+3072.
+ mov rax, 128
+ lea block_1, [block_0 + 128*8*2]
+ lea block_2, [block_0 + 128*8*3]
+ add block_0, 128*8*1
+
+ xor crc1,crc1
+ xor crc2,crc2
+
+; ;; branch into array
+; jmp CONCAT(crc_,128,)
+ ; Fall through into top of crc array (crc_128)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3) CRC Array: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; Unrolled body: labels crc_128 down to crc_2 are emitted by the %rep,
+ ;; each step feeding one qword from each of the three blocks into its
+ ;; own CRC accumulator. Entering at crc_<n> processes n steps.
+crc_array:
+%assign i 128
+%rep 128-1
+CONCAT(crc_,i,:)
+ crc32 crc_init, qword [block_0 - i*8]
+ crc32 crc1, qword [block_1 - i*8]
+ crc32 crc2, qword [block_2 - i*8]
+
+ %if i > 128*8 / 32 ; prefetch next 3KB data
+ prefetchnta [block_2 + 128*32 - i*32]
+ %endif
+%assign i (i-1)
+%endrep
+
+ ;; i == 1 here: label crc_1. The last qword of block 2 is deliberately
+ ;; left out; it is folded in during the merge in section 4.
+CONCAT(crc_,i,:)
+ crc32 crc_init, qword [block_0 - i*8]
+ crc32 crc1, qword [block_1 - i*8]
+; SKIP ;crc32 crc2, [block_2 - i*8] ; Don't do this one yet
+
+ ;; block_0 aliases bufptmp, so this advances the running buffer
+ ;; pointer to the end of the region just processed.
+ mov block_0, block_2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 4) Combine three results: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; rax = number of 24-byte chunks just done (n). The code below ends
+ ;; with bufp = K_table + (n-1)*16 and tmp reduced by n*24.
+ lea bufp, [K_table - 16] ; first entry is for idx 1
+ shl rax, 3 ; rax *= 8
+ sub tmp, rax ; tmp -= rax*8
+ shl rax, 1
+ sub tmp, rax ; tmp -= rax*16 (total tmp -= rax*24)
+ add bufp, rax
+
+ movdqa xmm0, [bufp] ; 2 consts: K1:K2
+
+ movq xmm1, crc_init ; CRC for block 1
+ pclmulqdq xmm1, xmm0, 0x00 ; Multiply by K2
+
+ movq xmm2, crc1 ; CRC for block 2
+ pclmulqdq xmm2, xmm0, 0x10 ; Multiply by K1
+
+ ;; XOR both products into the skipped last qword of block 2 and run
+ ;; it through crc32 on top of crc2 to get the combined CRC.
+ pxor xmm1, xmm2
+ movq rax, xmm1
+ xor rax, [block_2 - i*8]
+ mov crc_init, crc2
+ crc32 crc_init, rax
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 5) Check for end: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; crc_0 is the jump-table target for a chunk count of zero.
+CONCAT(crc_,0,:)
+ mov len, tmp ; reload remaining byte count
+ cmp tmp, 128*24
+ jae full_block
+ cmp tmp, 24
+ jae continue_block
+
+fewer_than_24:
+ ;; now fewer than 24 bytes remain
+ ;; tmp is rbx, so ebx below is the low half of the remaining count.
+ cmp tmp, 16
+ jae do_16
+ cmp tmp, 8
+ jae do_8
+
+ ;; 0 <= tmp <= 7
+ shl ebx, 29 ; size now in bits 31:29
+ jz do_return
+check_4:
+ ;; bufp is reused as a data register: load 8 bytes and peel off
+ ;; 4, 2 and 1 of them according to the three size bits. crc32 does not
+ ;; modify flags, so each "jz do_return" tests the ZF left by the shl
+ ;; above it (no size bits remaining).
+ mov bufp, [bufptmp]
+ shl ebx, 1 ; shift out into carry MSB (orig size & 4)
+ jnc check_2
+ crc32 crc_init_dw, bufp_dw
+ jz do_return
+ shr bufp, 32 ; shift data down by 4 bytes
+check_2:
+ shl ebx, 1 ; shift out into carry MSB (orig size & 2)
+ jnc check_1
+ crc32 crc_init_dw, bufp_w
+ jz do_return
+ shr bufp, 16 ; shift data down by 2 bytes
+check_1:
+ ;; Only reached while a size bit is still pending, i.e. (size & 1) != 0.
+ crc32 crc_init_dw, bufp_b
+
+do_return:
+ ;; Common epilogue: result in rax, restore the registers pushed on entry.
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+do_8:
+ ;; 8..15 bytes remain: one full qword, then the 0..7 byte tail.
+ crc32 crc_init, qword [bufptmp]
+ add bufptmp, 8
+ shl ebx, 29 ; size (0...7) in bits 31:29
+ jnz check_4
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+do_16:
+ ;; 16..23 bytes remain: two full qwords, then the 0..7 byte tail.
+ crc32 crc_init, qword [bufptmp]
+ crc32 crc_init, qword [bufptmp+8]
+ add bufptmp, 16
+ shl ebx, 29 ; size (0...7) in bits 31:29
+ jnz check_4
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Handle the case of fewer than 8 bytes, unaligned. In this case
+ ;; we can't read 8 bytes, as this might go beyond the end of the buffer
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+less_than_8:
+ ;; Entered from the ALIGN step only (unaligned pointer and len < 8).
+ ;; Each size bit of len selects an exact-width load (4, 2, 1 bytes), so
+ ;; nothing beyond len bytes is read; len == 0 touches no memory at all.
+ test len,4
+ jz less_than_4
+ crc32 crc_init_dw, dword[bufptmp]
+ add bufptmp,4
+less_than_4:
+ test len,2
+ jz less_than_2
+ crc32 crc_init_dw, word[bufptmp]
+ add bufptmp,2
+less_than_2:
+ test len,1
+ jz do_return
+ crc32 crc_init_dw, byte[bufptmp]
+ ;; Same epilogue as do_return, duplicated inline.
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+
+small:
+ ;; By-1 path for short buffers. It is entered from continue_block with
+ ;; len < SMALL_SIZE, so the whole byte count fits in len_b. The count is
+ ;; shifted out MSB first; every set bit selects one power-of-two sized
+ ;; chunk (128, 64, 32, 16, 8, 4, 2, 1 bytes). crc32 does not modify the
+ ;; flags, so each "je do_return2" tests the ZF produced by the preceding
+ ;; "shl len_b, 1", i.e. "no lower size bits are left".
+ ;; The CRC is accumulated in rax, which is also the return register.
+ mov rax, crc_init
+
+ ;; Zero-length guard: without it an aligned call with len == 0 would
+ ;; fall through every jnc down to the 8-byte load at bit3 and
+ ;; dereference the buffer pointer. Return crc_init untouched instead.
+ test len, len
+ jz do_return2
+
+bit8:
+ shl len_b, 1 ;; shift-out MSB (bit-7, 128 bytes)
+ jnc bit7 ;; skip to bit-6 handling if bit-7 == 0
+ %assign i 0
+ %rep 16
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 128 ;; buf +=128; (next 128 bytes)
+
+bit7:
+ shl len_b, 1 ;; shift-out MSB (bit-6, 64 bytes)
+ jnc bit6 ;; skip to bit-5 handling if bit-6 == 0
+ %assign i 0
+ %rep 8
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 64 ;; buf +=64; (next 64 bytes)
+bit6:
+ shl len_b, 1 ;; shift-out MSB (bit-5, 32 bytes)
+ jnc bit5 ;; skip to bit-4 handling if bit-5 == 0
+ %assign i 0
+ %rep 4
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 32 ;; buf +=32; (next 32 bytes)
+bit5:
+ shl len_b, 1 ;; shift-out MSB (bit-4, 16 bytes)
+ jnc bit4 ;; skip to bit-3 handling if bit-4 == 0
+ %assign i 0
+ %rep 2
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 16 ;; buf +=16; (next 16 bytes)
+bit4:
+ shl len_b, 1 ;; shift-out MSB (bit-3, 8 bytes)
+ jnc bit3 ;; skip to bit-2 handling if bit-3 == 0
+ crc32 rax, qword [bufptmp] ;; compute crc32 of 8-byte data
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 8 ;; buf +=8; (next 8 bytes)
+bit3:
+ ;; 1..7 bytes remain. bufptmp is 8-byte aligned on this path, so the
+ ;; full-qword load may cover bytes past the logical end of the buffer
+ ;; but stays inside the same aligned 8-byte word. Only the low
+ ;; (len & 7) bytes are fed to crc32 below.
+ mov rbx, qword [bufptmp] ;; load a 8-bytes from the buffer:
+ shl len_b, 1 ;; shift-out MSB (bit-2, 4 bytes)
+ jnc bit2 ;; skip to bit-1 handling if bit-2 == 0
+ crc32 eax, ebx ;; compute crc32 of 4-byte data
+ je do_return2 ;; return if remaining data is zero
+ shr rbx, 32 ;; drop the 4 consumed bytes (up to 3 left)
+bit2:
+ shl len_b, 1 ;; shift-out MSB (bit-1, 2 bytes)
+ jnc bit1 ;; skip to bit-0 handling if bit-1 == 0
+ crc32 eax, bx ;; compute crc32 of 2-byte data
+ je do_return2 ;; return if remaining data is zero
+ shr rbx, 16 ;; next byte
+bit1:
+ ;; Original bit-0 now sits in the top bit of len_b.
+ test len_b,len_b
+ je do_return2
+ crc32 eax, bl ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return2:
+ ;; Epilogue for the by-1 path: the result is already in rax.
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; jump table ;; Table is 129 entries x 2 bytes each
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Entry n holds the 16-bit distance from crc_array to label crc_<n>
+;; (n = 0..128). continue_block indexes it with the number of 24-byte
+;; chunks to process and adds the result to crc_array to form the branch
+;; target.
+align 4
+jump_table:
+%assign i 0
+%rep 129
+ dw CONCAT(crc_,i,) - crc_array
+%assign i (i+1)
+%endrep
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; PCLMULQDQ tables
+ ;; Table is 128 entries x 2 quad words each
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .data
+align 64
+K_table:
+ dq 0x14cd00bd6, 0x105ec76f0
+ dq 0x0ba4fc28e, 0x14cd00bd6
+ dq 0x1d82c63da, 0x0f20c0dfe
+ dq 0x09e4addf8, 0x0ba4fc28e
+ dq 0x039d3b296, 0x1384aa63a
+ dq 0x102f9b8a2, 0x1d82c63da
+ dq 0x14237f5e6, 0x01c291d04
+ dq 0x00d3b6092, 0x09e4addf8
+ dq 0x0c96cfdc0, 0x0740eef02
+ dq 0x18266e456, 0x039d3b296
+ dq 0x0daece73e, 0x0083a6eec
+ dq 0x0ab7aff2a, 0x102f9b8a2
+ dq 0x1248ea574, 0x1c1733996
+ dq 0x083348832, 0x14237f5e6
+ dq 0x12c743124, 0x02ad91c30
+ dq 0x0b9e02b86, 0x00d3b6092
+ dq 0x018b33a4e, 0x06992cea2
+ dq 0x1b331e26a, 0x0c96cfdc0
+ dq 0x17d35ba46, 0x07e908048
+ dq 0x1bf2e8b8a, 0x18266e456
+ dq 0x1a3e0968a, 0x11ed1f9d8
+ dq 0x0ce7f39f4, 0x0daece73e
+ dq 0x061d82e56, 0x0f1d0f55e
+ dq 0x0d270f1a2, 0x0ab7aff2a
+ dq 0x1c3f5f66c, 0x0a87ab8a8
+ dq 0x12ed0daac, 0x1248ea574
+ dq 0x065863b64, 0x08462d800
+ dq 0x11eef4f8e, 0x083348832
+ dq 0x1ee54f54c, 0x071d111a8
+ dq 0x0b3e32c28, 0x12c743124
+ dq 0x0064f7f26, 0x0ffd852c6
+ dq 0x0dd7e3b0c, 0x0b9e02b86
+ dq 0x0f285651c, 0x0dcb17aa4
+ dq 0x010746f3c, 0x018b33a4e
+ dq 0x1c24afea4, 0x0f37c5aee
+ dq 0x0271d9844, 0x1b331e26a
+ dq 0x08e766a0c, 0x06051d5a2
+ dq 0x093a5f730, 0x17d35ba46
+ dq 0x06cb08e5c, 0x11d5ca20e
+ dq 0x06b749fb2, 0x1bf2e8b8a
+ dq 0x1167f94f2, 0x021f3d99c
+ dq 0x0cec3662e, 0x1a3e0968a
+ dq 0x19329634a, 0x08f158014
+ dq 0x0e6fc4e6a, 0x0ce7f39f4
+ dq 0x08227bb8a, 0x1a5e82106
+ dq 0x0b0cd4768, 0x061d82e56
+ dq 0x13c2b89c4, 0x188815ab2
+ dq 0x0d7a4825c, 0x0d270f1a2
+ dq 0x10f5ff2ba, 0x105405f3e
+ dq 0x00167d312, 0x1c3f5f66c
+ dq 0x0f6076544, 0x0e9adf796
+ dq 0x026f6a60a, 0x12ed0daac
+ dq 0x1a2adb74e, 0x096638b34
+ dq 0x19d34af3a, 0x065863b64
+ dq 0x049c3cc9c, 0x1e50585a0
+ dq 0x068bce87a, 0x11eef4f8e
+ dq 0x1524fa6c6, 0x19f1c69dc
+ dq 0x16cba8aca, 0x1ee54f54c
+ dq 0x042d98888, 0x12913343e
+ dq 0x1329d9f7e, 0x0b3e32c28
+ dq 0x1b1c69528, 0x088f25a3a
+ dq 0x02178513a, 0x0064f7f26
+ dq 0x0e0ac139e, 0x04e36f0b0
+ dq 0x0170076fa, 0x0dd7e3b0c
+ dq 0x141a1a2e2, 0x0bd6f81f8
+ dq 0x16ad828b4, 0x0f285651c
+ dq 0x041d17b64, 0x19425cbba
+ dq 0x1fae1cc66, 0x010746f3c
+ dq 0x1a75b4b00, 0x18db37e8a
+ dq 0x0f872e54c, 0x1c24afea4
+ dq 0x01e41e9fc, 0x04c144932
+ dq 0x086d8e4d2, 0x0271d9844
+ dq 0x160f7af7a, 0x052148f02
+ dq 0x05bb8f1bc, 0x08e766a0c
+ dq 0x0a90fd27a, 0x0a3c6f37a
+ dq 0x0b3af077a, 0x093a5f730
+ dq 0x04984d782, 0x1d22c238e
+ dq 0x0ca6ef3ac, 0x06cb08e5c
+ dq 0x0234e0b26, 0x063ded06a
+ dq 0x1d88abd4a, 0x06b749fb2
+ dq 0x04597456a, 0x04d56973c
+ dq 0x0e9e28eb4, 0x1167f94f2
+ dq 0x07b3ff57a, 0x19385bf2e
+ dq 0x0c9c8b782, 0x0cec3662e
+ dq 0x13a9cba9e, 0x0e417f38a
+ dq 0x093e106a4, 0x19329634a
+ dq 0x167001a9c, 0x14e727980
+ dq 0x1ddffc5d4, 0x0e6fc4e6a
+ dq 0x00df04680, 0x0d104b8fc
+ dq 0x02342001e, 0x08227bb8a
+ dq 0x00a2a8d7e, 0x05b397730
+ dq 0x168763fa6, 0x0b0cd4768
+ dq 0x1ed5a407a, 0x0e78eb416
+ dq 0x0d2c3ed1a, 0x13c2b89c4
+ dq 0x0995a5724, 0x1641378f0
+ dq 0x19b1afbc4, 0x0d7a4825c
+ dq 0x109ffedc0, 0x08d96551c
+ dq 0x0f2271e60, 0x10f5ff2ba
+ dq 0x00b0bf8ca, 0x00bf80dd2
+ dq 0x123888b7a, 0x00167d312
+ dq 0x1e888f7dc, 0x18dcddd1c
+ dq 0x002ee03b2, 0x0f6076544
+ dq 0x183e8d8fe, 0x06a45d2b2
+ dq 0x133d7a042, 0x026f6a60a
+ dq 0x116b0f50c, 0x1dd3e10e8
+ dq 0x05fabe670, 0x1a2adb74e
+ dq 0x130004488, 0x0de87806c
+ dq 0x000bcf5f6, 0x19d34af3a
+ dq 0x18f0c7078, 0x014338754
+ dq 0x017f27698, 0x049c3cc9c
+ dq 0x058ca5f00, 0x15e3e77ee
+ dq 0x1af900c24, 0x068bce87a
+ dq 0x0b5cfca28, 0x0dd07448e
+ dq 0x0ded288f8, 0x1524fa6c6
+ dq 0x059f229bc, 0x1d8048348
+ dq 0x06d390dec, 0x16cba8aca
+ dq 0x037170390, 0x0a3e3e02c
+ dq 0x06353c1cc, 0x042d98888
+ dq 0x0c4584f5c, 0x0d73c7bea
+ dq 0x1f16a3418, 0x1329d9f7e
+ dq 0x0531377e2, 0x185137662
+ dq 0x1d8d9ca7c, 0x1b1c69528
+ dq 0x0b25b29f2, 0x18a08b5bc
+ dq 0x19fb2a8b0, 0x02178513a
+ dq 0x1a08fe6ac, 0x1da758ae0
+ dq 0x045cddf4e, 0x0e0ac139e
+ dq 0x1a91647f2, 0x169cf9eb0
+ dq 0x1a0f717c4, 0x0170076fa
+
+;;; func core, ver, snum
+slversion crc32_iscsi_01, 01, 03, 0015
+
diff --git a/src/isa-l/crc/crc32_iscsi_perf.c b/src/isa-l/crc/crc32_iscsi_perf.c
new file mode 100644
index 00000000..2770099a
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_perf.c
@@ -0,0 +1,87 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+// Benchmark sizing. Define CACHED_TEST to loop many times over a small,
+// cache-resident buffer; leave it undefined to stream a buffer larger than
+// the last-level cache. Every arithmetic expansion is parenthesized so the
+// macros stay correct inside larger expressions.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN (8*1024)
+# define TEST_LOOPS 1000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE (32*1024*1024) /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 500
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+// Times TEST_LOOPS calls of crc32_iscsi() over a zero-filled, 1024-byte
+// aligned buffer of TEST_LEN bytes and prints the throughput figure produced
+// by perf_print(). One untimed call is made first as a warm-up.
+// Returns 0 on success, -1 if the buffer cannot be allocated.
+int main(int argc, char *argv[])
+{
+	int i;
+	void *buf;
+	uint32_t crc;
+	struct perf start, stop;
+
+	printf("crc32_iscsi_perf:\n");
+
+	if (posix_memalign(&buf, 1024, TEST_LEN)) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+
+	printf("Start timed tests\n");
+	fflush(NULL);
+
+	memset(buf, 0, TEST_LEN);
+	crc = crc32_iscsi(buf, TEST_LEN, TEST_SEED);	// warm-up, not timed
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++) {
+		crc = crc32_iscsi(buf, TEST_LEN, TEST_SEED);
+	}
+	perf_stop(&stop);
+	printf("crc32_iscsi" TEST_TYPE_STR ": ");
+	// i == TEST_LOOPS here; widen before multiplying to avoid int overflow
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	// Printing the CRC keeps the timed calls observable
+	printf("finish 0x%x\n", crc);
+
+	// Release the benchmark buffer instead of leaking it at exit
+	free(buf);
+	return 0;
+}
diff --git a/src/isa-l/crc/crc32_iscsi_test.c b/src/isa-l/crc/crc32_iscsi_test.c
new file mode 100644
index 00000000..e37f23f1
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_test.c
@@ -0,0 +1,171 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "crc.h"
+#include "types.h"
+
+// 256-entry lookup table of 32-bit constants, one entry per byte value.
+// NOTE(review): main() in this file never references the table; it checks
+// crc32_iscsi() against crc32_iscsi_base() instead. Confirm whether another
+// translation unit links against this symbol before removing it or making it
+// static const.
+// NOTE(review): the entries look like the reflected CRC-32C (iSCSI) byte
+// table - confirm against the base implementation.
+unsigned long crc32_table_iscsi[256] = {
+ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+ 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+ 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+ 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+ 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+ 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+ 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+ 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+ 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+ 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+ 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+ 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+ 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+ 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+ 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+ 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+ 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+ 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+ 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+ 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+ 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+ 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+ 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+ 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+ 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+ 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+ 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+ 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+ 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+ 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+ 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+ 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+ 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
+};
+
+#define PAGESIZE 10240
+
+// Cross-checks crc32_iscsi() against the reference crc32_iscsi_base() over a
+// PAGESIZE-byte pattern buffer:
+//   1) every length at each of the first 128 start offsets, seed -1;
+//   2) the same sweep with a seed that is perturbed after every comparison;
+//   3) buffers that end exactly at the end of the allocation.
+// Prints a '.' per completed sweep and "Pass" at the end.
+// Returns 0 on success, -1 on allocation failure or on the first mismatch.
+int main(void)
+{
+	unsigned int i, j, good, test, init_crc = 1;
+	int ret = -1;
+
+	printf("crc32_iscsi_test: ");
+
+	unsigned char *q_buf = malloc(PAGESIZE);
+	if (q_buf == NULL) {
+		printf("alloc of q_buf failed\n");
+		return -1;
+	}
+	// fill q_buf with semi-random data
+	for (i = 0; i < PAGESIZE; i++)
+		q_buf[i] = (unsigned char)(i ^ (13 + (i >> 8)) ^ ((i >> 16) - 13));
+
+	// Test case 1: Compare against base/simple crc32 implementation and
+	// try all offsets/alignments of buffer.
+
+	for (j = 0; j < 128; j++) {
+		for (i = 0; i < PAGESIZE - j; i++) {
+			good = crc32_iscsi_base(q_buf + j, i, -1);
+			test = crc32_iscsi(q_buf + j, i, -1);
+			if (good != test) {
+				printf("Error for size %u offset %u, %08X should be %08X\n",
+				       i, j, test, good);
+				goto out;
+			}
+		}		// end for i
+		putchar('.');
+		fflush(NULL);
+	}			// end for j
+
+	// Test case 2: Also vary initial CRC
+
+	for (j = 0; j < 128; j++) {	// do all offsets
+		for (i = 0; i < PAGESIZE - j; i++) {
+			good = crc32_iscsi_base(q_buf + j, i, init_crc);
+			test = crc32_iscsi(q_buf + j, i, init_crc);
+			if (good != test) {
+				printf("Error for size %u offset %u, %08X should be %08X\n",
+				       i, j, test, good);
+				goto out;
+			}
+			// modify init_crc semi-randomly; the unsigned constant keeps
+			// a shift count of 31 well defined (1 << 31 overflows int)
+			init_crc ^= 1u << ((i * 3 + j * 5) & 31);
+		}		// end for i
+		putchar('.');
+		fflush(NULL);
+	}			// end for j
+
+	// Test case 3: do end of buffer
+
+	for (i = 0; i < PAGESIZE; i++) {
+		good = crc32_iscsi_base(q_buf + i, PAGESIZE - i, -1);
+		test = crc32_iscsi(q_buf + i, PAGESIZE - i, -1);
+		if (good != test) {
+			// report the length that was actually hashed, not the offset
+			printf("Error for size %u at eob, %08X should be %08X\n",
+			       PAGESIZE - i, test, good);
+			goto out;
+		}
+	}			// end for i
+	putchar('.');
+	fflush(NULL);
+
+	printf("Pass\n");
+	ret = 0;
+
+out:
+	// single exit so the pattern buffer is released on every path
+	free(q_buf);
+	return ret;
+}
diff --git a/src/isa-l/crc/crc64_base.c b/src/isa-l/crc/crc64_base.c
new file mode 100644
index 00000000..29166f90
--- /dev/null
+++ b/src/isa-l/crc/crc64_base.c
@@ -0,0 +1,159 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include "crc64.h"
+
+#define MAX_ITER 8
+
+// Bit-at-a-time CRC64 for a reflected (least-significant-bit-first)
+// polynomial. The remainder starts as ~seed; each input byte is XORed into
+// the low 8 bits and then shifted out one bit at a time (MAX_ITER shifts
+// per byte), XORing in poly whenever a set bit falls off the low end. The
+// final remainder is inverted again, so a zero-length buffer returns seed.
+// The byte index is as wide as len: a 32-bit index would wrap for buffers
+// of 4 GiB or more and the loop would never terminate.
+static uint64_t crc64_refl_bitwise(uint64_t seed, const uint8_t * buf, uint64_t len,
+                                   uint64_t poly)
+{
+    uint64_t rem = ~seed;
+    uint64_t i;
+    unsigned int j;
+
+    for (i = 0; i < len; i++) {
+        rem ^= (uint64_t) buf[i];
+        for (j = 0; j < MAX_ITER; j++)
+            rem = (rem & 0x1ULL ? poly : 0) ^ (rem >> 1);
+    }
+    return ~rem;
+}
+
+// Bit-at-a-time CRC64 for a normal (most-significant-bit-first) polynomial.
+// Same scheme as crc64_refl_bitwise(), except that each byte is XORed into
+// the top 8 bits of the remainder and bits are shifted out of the high end.
+static uint64_t crc64_norm_bitwise(uint64_t seed, const uint8_t * buf, uint64_t len,
+                                   uint64_t poly)
+{
+    uint64_t rem = ~seed;
+    uint64_t i;
+    unsigned int j;
+
+    for (i = 0; i < len; i++) {
+        rem ^= (uint64_t) buf[i] << 56;
+        for (j = 0; j < MAX_ITER; j++)
+            rem = (rem & 0x8000000000000000ULL ? poly : 0) ^ (rem << 1);
+    }
+    return ~rem;
+}
+
+// crc64_ecma baseline functions
+// Slow crc64 from the definition. Can be sped up with a lookup table.
+// seed: initial CRC value (inverted on entry, result inverted on exit)
+// buf:  data to checksum
+// len:  number of bytes in buf
+// Returns the CRC64 of buf; returns seed unchanged when len is 0.
+uint64_t crc64_ecma_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+    // ECMA-182 standard reflected
+    return crc64_refl_bitwise(seed, buf, len, 0xC96C5795D7870F42ULL);
+}
+
+uint64_t crc64_ecma_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+    // ECMA-182 standard
+    return crc64_norm_bitwise(seed, buf, len, 0x42F0E1EBA9EA3693ULL);
+}
+
+// crc64_iso baseline functions
+// Same contract as the crc64_ecma functions, with the ISO polynomial.
+uint64_t crc64_iso_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+    // ISO standard reflected
+    return crc64_refl_bitwise(seed, buf, len, 0xD800000000000000ULL);
+}
+
+uint64_t crc64_iso_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+    // ISO standard
+    return crc64_norm_bitwise(seed, buf, len, 0x000000000000001BULL);
+}
+
+// crc64_jones baseline functions
+// Same contract as the crc64_ecma functions, with the Jones coefficients.
+uint64_t crc64_jones_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+    // Jones coefficients reflected
+    return crc64_refl_bitwise(seed, buf, len, 0x95ac9329ac4bc9b5ULL);
+}
+
+uint64_t crc64_jones_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+    // Jones coefficients
+    return crc64_norm_bitwise(seed, buf, len, 0xad93d23594c935a9ULL);
+}
+
+// Version record emitted for each baseline function in this file.
+// NOTE(review): the asm sources label the same three fields "core, ver,
+// snum" on their slversion lines, and the 8-hex-digit suffix of each
+// tentative symbol below ends in the matching snum value - confirm the
+// exact encoding against the slversion macro.
+struct slver {
+    unsigned short snum;
+    unsigned char ver;
+    unsigned char core;
+};
+
+// crc64_ecma_refl_base
+struct slver crc64_ecma_refl_base_slver_0000001c;
+struct slver crc64_ecma_refl_base_slver = { .snum = 0x001c, .ver = 0x00, .core = 0x00 };
+
+// crc64_ecma_norm_base
+struct slver crc64_ecma_norm_base_slver_00000019;
+struct slver crc64_ecma_norm_base_slver = { .snum = 0x0019, .ver = 0x00, .core = 0x00 };
+
+// crc64_iso_refl_base
+struct slver crc64_iso_refl_base_slver_00000022;
+struct slver crc64_iso_refl_base_slver = { .snum = 0x0022, .ver = 0x00, .core = 0x00 };
+
+// crc64_iso_norm_base
+struct slver crc64_iso_norm_base_slver_0000001f;
+struct slver crc64_iso_norm_base_slver = { .snum = 0x001f, .ver = 0x00, .core = 0x00 };
+
+// crc64_jones_refl_base
+struct slver crc64_jones_refl_base_slver_00000028;
+struct slver crc64_jones_refl_base_slver = { .snum = 0x0028, .ver = 0x00, .core = 0x00 };
+
+// crc64_jones_norm_base
+struct slver crc64_jones_norm_base_slver_00000025;
+struct slver crc64_jones_norm_base_slver = { .snum = 0x0025, .ver = 0x00, .core = 0x00 };
diff --git a/src/isa-l/crc/crc64_ecma_norm_by8.asm b/src/isa-l/crc/crc64_ecma_norm_by8.asm
new file mode 100644
index 00000000..cff01e1c
--- /dev/null
+++ b/src/isa-l/crc/crc64_ecma_norm_by8.asm
@@ -0,0 +1,583 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; uint64_t crc64_ecma_norm_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+; Argument registers: arg1/arg2/arg3 carry init_crc, buf and len (see the
+; Function API comment above) in the registers used by the target calling
+; convention.
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+; Stack frame layout, as offsets from rsp after "sub rsp,VARIABLE_OFFSET":
+;   [rsp]             16-byte scratch slot used by _less_than_16_left
+;   [rsp + XMM_SAVE]  win64 only: save area for xmm6-xmm13 (8 x 16 bytes)
+; TMP is defined but not referenced by the code in this file.
+; NOTE(review): the extra +8 in VARIABLE_OFFSET presumably restores 16-byte
+; alignment of rsp after the call pushed the return address (the movdqa
+; accesses to the frame need aligned addresses) - confirm.
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+align 16
+global crc64_ecma_norm_by8:function
+crc64_ecma_norm_by8:
+
+ not arg1 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; save xmm6-xmm13 on the stack; the win64 calling convention treats them as callee-saved
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial crc at correct place.
+ pslldq xmm10, 8
+
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ ; on entry xmm7 holds the 128-bit value left over from folding
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7 ; keep a copy of the value before the multiply
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x01 ; H*L
+ pslldq xmm0, 8 ; shift the copy left by 8 bytes (low qword -> high qword)
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+ ; _barrett is also entered directly from _end_1to7 (inputs of 1 to 7 bytes)
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7 ; copy that is XORed back in after the two multiplies
+
+ movdqa xmm1, xmm7
+ pand xmm1, [mask3] ; mask3 keeps only the high 64 bits
+ pclmulqdq xmm7, xmm10, 0x01 ; high qword of xmm7 * rk7
+ pxor xmm7, xmm1
+
+ pclmulqdq xmm7, xmm10, 0x11 ; high qword of xmm7 * rk8
+ pxor xmm7, xmm0
+ pextrq rax, xmm7, 0 ; the result is taken from the low 64 bits of xmm7
+
+_cleanup:
+ ; also reached from _less_than_32 with rax = arg1 for zero-length buffers
+ not rax ; final inversion, mirroring the "not arg1" applied to init_crc at entry
+%ifidn __OUTPUT_FORMAT__, win64
+ ; restore the xmm registers saved at function entry
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET ; release the stack frame reserved at entry
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Right shift (8-length) bytes in XMM
+ add rax, 8
+ movdqu xmm0, [rax]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+
+rk1 :
+DQ 0x5f5c3c7eb52fab6
+rk2 :
+DQ 0x4eb938a7d257740e
+rk3 :
+DQ 0x5cf79dea9ac37d6
+rk4 :
+DQ 0x001067e571d7d5c2
+rk5 :
+DQ 0x5f5c3c7eb52fab6
+rk6 :
+DQ 0x0000000000000000
+rk7 :
+DQ 0x578d29d06cc4f872
+rk8 :
+DQ 0x42f0e1eba9ea3693
+rk9 :
+DQ 0xe464f4df5fb60ac1
+rk10 :
+DQ 0xb649c5b35a759cf2
+rk11 :
+DQ 0x9af04e1eff82d0dd
+rk12 :
+DQ 0x6e82e609297f8fe8
+rk13 :
+DQ 0x97c516e98bd2e73
+rk14 :
+DQ 0xb76477b31e22e7b
+rk15 :
+DQ 0x5f6843ca540df020
+rk16 :
+DQ 0xddf4b6981205b83f
+rk17 :
+DQ 0x54819d8713758b2c
+rk18 :
+DQ 0x4a6b90073eb0af5a
+rk19 :
+DQ 0x571bee0a227ef92b
+rk20 :
+DQ 0x44bef2a201b5200c
+
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+mask3:
+dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; (a pshufb control byte with its top bit set produces a zero byte)
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+; The rows below are the data actually assembled. The code does an unaligned
+; 16-byte load at a computed byte offset from pshufb_shf_table + 16 to obtain
+; the shuffle mask for the required shift.
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_ecma_norm_by8, 01, 00, 001a
diff --git a/src/isa-l/crc/crc64_ecma_refl_by8.asm b/src/isa-l/crc/crc64_ecma_refl_by8.asm
new file mode 100644
index 00000000..9d3847e9
--- /dev/null
+++ b/src/isa-l/crc/crc64_ecma_refl_by8.asm
@@ -0,0 +1,548 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_ecma_refl_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; sample yasm command line:
+; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+; Argument registers: arg1/arg2/arg3 carry init_crc, buf and len (see the
+; Function API comment above) in the registers used by the target calling
+; convention.
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+; Stack frame layout, as offsets from rsp after "sub rsp, VARIABLE_OFFSET":
+;   [rsp]             16-byte scratch slot used by _less_than_16_left
+;   [rsp + XMM_SAVE]  win64 only: save area for xmm6-xmm13 (8 x 16 bytes)
+; TMP is defined but not referenced by the code in this file.
+; NOTE(review): the extra +8 in VARIABLE_OFFSET presumably restores 16-byte
+; alignment of rsp after the call pushed the return address (the movdqa
+; accesses to the frame need aligned addresses) - confirm.
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+
+align 16
+global crc64_ecma_refl_by8:function
+crc64_ecma_refl_by8:
+ ; uint64_t c = crc ^ 0xffffffff,ffffffffL;
+ not arg1
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; save xmm6-xmm13 on the stack; the win64 calling convention treats them as callee-saved
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm1, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm3, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm5, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm7, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+ ; xmm0 to xmm7
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+ ;xmm1 to xmm7
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+ ; xmm6 to xmm7
+ movdqa xmm10, [rk1]
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ movdqa xmm2, xmm7
+ movdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ movdqu xmm0, [rax]
+
+
+ pshufb xmm7, xmm0
+ pxor xmm0, [mask3]
+ pshufb xmm2, xmm0
+
+ pblendvb xmm2, xmm1 ;xmm0 is implicit
+ ;;;;;;;;;;
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ ; on entry xmm7 holds the 128-bit value left over from folding
+ movdqa xmm10, [rk5] ; rk5 (low qword) and rk6 (high qword) in xmm10
+ movdqa xmm0, xmm7 ; keep a copy of the value before the multiply
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0 ; low qword of xmm7 * rk5
+ psrldq xmm0, 8 ; shift the copy right by 8 bytes (high qword -> low qword)
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+ ; _barrett is also entered directly from _end_1to7 (inputs of 1 to 7 bytes)
+_barrett:
+ movdqa xmm1, xmm7 ; copy that is XORed back in after the two multiplies
+ movdqa xmm10, [rk7] ; rk7 (low qword) and rk8 (high qword) in xmm10
+
+ pclmulqdq xmm7, xmm10, 0 ; low qword of xmm7 * rk7
+ movdqa xmm2, xmm7
+ pclmulqdq xmm7, xmm10, 0x10 ; low qword of xmm7 * rk8
+ pslldq xmm2, 8 ; first product shifted left by 8 bytes
+ pxor xmm7, xmm2
+ pxor xmm7, xmm1
+ pextrq rax, xmm7, 1 ; the result is taken from the high 64 bits of xmm7
+
+_cleanup:
+ ; also reached from _less_than_32 with rax = arg1 for zero-length buffers
+ ; return c ^ 0xffffffff, ffffffffL;
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; restore the xmm registers saved at function entry
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET ; release the stack frame reserved at entry
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movq xmm0, arg1 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax + r9]
+ pshufb xmm7,xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Left shift (8-length) bytes in XMM
+ movdqu xmm0, [rax + r9 + 8]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+; rk7 = floor(2^128/Q)
+; rk8 = Q
+rk1 :
+DQ 0xdabe95afc7875f40
+rk2 :
+DQ 0xe05dd497ca393ae4
+rk3 :
+DQ 0xd7d86b2af73de740
+rk4 :
+DQ 0x8757d71d4fcc1000
+rk5 :
+DQ 0xdabe95afc7875f40
+rk6 :
+DQ 0x0000000000000000
+rk7 :
+DQ 0x9c3e466c172963d5
+rk8 :
+DQ 0x92d8af2baf0e1e84
+rk9 :
+DQ 0x947874de595052cb
+rk10 :
+DQ 0x9e735cb59b4724da
+rk11 :
+DQ 0xe4ce2cd55fea0037
+rk12 :
+DQ 0x2fe3fd2920ce82ec
+rk13 :
+DQ 0xe31d519421a63a5
+rk14 :
+DQ 0x2e30203212cac325
+rk15 :
+DQ 0x81f6054a7842df4
+rk16 :
+DQ 0x6ae3efbb9dd441f3
+rk17 :
+DQ 0x69a35d91c3730254
+rk18 :
+DQ 0xb5ea1af9c013aca4
+rk19 :
+DQ 0x3be653a30fe1af51
+rk20 :
+DQ 0x60095b008a9efa44
+
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; (a pshufb control byte with its top bit set produces a zero byte)
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+; The rows below are the data actually assembled. The code does an unaligned
+; 16-byte load at pshufb_shf_table plus a byte offset derived from the
+; remaining length to obtain the shuffle mask for the required shift.
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3:
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_ecma_refl_by8, 01, 00, 001d
diff --git a/src/isa-l/crc/crc64_example.c b/src/isa-l/crc/crc64_example.c
new file mode 100644
index 00000000..64763a1b
--- /dev/null
+++ b/src/isa-l/crc/crc64_example.c
@@ -0,0 +1,68 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "crc64.h"
+
+// Size in bytes of the buffer that main() fills with each fread() call.
+#define BUF_SIZE 8192
+// Value the running checksum is set to before the first crc64_ecma_refl() call.
+#define INIT_SEED 0x12345678
+
+/*
+ * Example: compute the crc64_ecma_refl checksum of a file.
+ *
+ * Usage: crc64_example infile
+ *
+ * The file is read in BUF_SIZE chunks and each chunk is fed to
+ * crc64_ecma_refl() with the previous result as the seed ("update mode"),
+ * starting from INIT_SEED.
+ *
+ * Returns 0 on success and EXIT_FAILURE on a usage error, when the file
+ * cannot be opened, or when a read error occurs.
+ */
+int main(int argc, char *argv[])
+{
+	uint8_t inbuf[BUF_SIZE];
+	uint64_t avail_in, total_in = 0;
+	uint64_t crc64_checksum;
+	int read_failed;
+	FILE *in;
+
+	if (argc != 2) {
+		fprintf(stderr, "Usage: crc64_example infile\n");
+		exit(EXIT_FAILURE);
+	}
+	in = fopen(argv[1], "rb");
+	if (!in) {
+		fprintf(stderr, "Can't open %s for reading\n", argv[1]);
+		exit(EXIT_FAILURE);
+	}
+
+	printf("crc64_example -- crc64_ecma_refl:\n");
+	fflush(NULL);
+
+	crc64_checksum = INIT_SEED;
+	while ((avail_in = fread(inbuf, 1, BUF_SIZE, in)) != 0) {
+		// crc update mode: the previous checksum seeds the next chunk
+		crc64_checksum = crc64_ecma_refl(crc64_checksum, inbuf, avail_in);
+		total_in += avail_in;
+	}
+
+	// fread() returns 0 for both EOF and error; tell them apart before closing
+	read_failed = ferror(in);
+	fclose(in);
+	if (read_failed) {
+		fprintf(stderr, "Error while reading %s\n", argv[1]);
+		return EXIT_FAILURE;
+	}
+
+	// uint64_t is not guaranteed to be 'long'; cast to match the specifiers
+	printf("total length is %llu, checksum is 0x%llx\n",
+	       (unsigned long long)total_in, (unsigned long long)crc64_checksum);
+
+	return 0;
+}
diff --git a/src/isa-l/crc/crc64_funcs_perf.c b/src/isa-l/crc/crc64_funcs_perf.c
new file mode 100644
index 00000000..04135bff
--- /dev/null
+++ b/src/isa-l/crc/crc64_funcs_perf.c
@@ -0,0 +1,109 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc64.h"
+#include "test.h"
+
+// Benchmark configuration.
+// Define CACHED_TEST to loop many times over a small buffer ("_warm");
+// otherwise a large buffer is used with fewer loops ("_cold").
+// Size macros are fully parenthesized so they expand safely inside any
+// expression (e.g. next to %, / or a cast).
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN (8*1024)
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE (32*1024*1024) /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+// Byte value used to fill the buffer and seed passed to every CRC call
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+// Signature shared by every crc64 routine: (seed, buffer, length) -> crc
+typedef uint64_t(*crc64_func_t) (uint64_t, const uint8_t *, uint64_t);
+
+// One benchmark entry: a printable name plus the function under test and
+// its reference (base) implementation.
+typedef struct func_case {
+	const char *note;	// points at a string literal, so must not be written through
+	crc64_func_t crc64_func_call;
+	crc64_func_t crc64_ref_call;
+} func_case_t;
+
+// Table of the crc64 variants iterated over by main()
+func_case_t test_funcs[] = {
+	{"crc64_ecma_norm", crc64_ecma_norm, crc64_ecma_norm_base},
+	{"crc64_ecma_refl", crc64_ecma_refl, crc64_ecma_refl_base},
+	{"crc64_iso_norm", crc64_iso_norm, crc64_iso_norm_base},
+	{"crc64_iso_refl", crc64_iso_refl, crc64_iso_refl_base},
+	{"crc64_jones_norm", crc64_jones_norm, crc64_jones_norm_base},
+	{"crc64_jones_refl", crc64_jones_refl, crc64_jones_refl_base}
+};
+
+/*
+ * Time every entry of test_funcs[] over a TEST_LEN-byte buffer.
+ *
+ * For each entry the function is called once untimed, then TEST_LOOPS times
+ * between perf_start() and perf_stop(); the result is reported through
+ * perf_print() together with the last computed crc.
+ *
+ * Returns 0 on success, -1 if the buffer cannot be allocated.
+ */
+int main(int argc, char *argv[])
+{
+	size_t j;		// unsigned: compared against a sizeof expression
+	int i;
+	void *buf;
+	uint64_t crc;
+	struct perf start, stop;
+	func_case_t *test_func;
+
+	if (posix_memalign(&buf, 1024, TEST_LEN)) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+	memset(buf, (char)TEST_SEED, TEST_LEN);
+
+	for (j = 0; j < sizeof(test_funcs) / sizeof(test_funcs[0]); j++) {
+		test_func = &test_funcs[j];
+		printf("%s_perf:\n", test_func->note);
+
+		printf("Start timed tests\n");
+		fflush(0);
+
+		// one untimed call before the measured loop
+		crc = test_func->crc64_func_call(TEST_SEED, buf, TEST_LEN);
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			crc = test_func->crc64_func_call(TEST_SEED, buf, TEST_LEN);
+		}
+		perf_stop(&stop);
+		printf("%s" TEST_TYPE_STR ": ", test_func->note);
+		perf_print(stop, start, (long long)TEST_LEN * i);
+
+		// uint64_t is not guaranteed to be 'long'; cast to match %llx
+		printf("finish 0x%llx\n", (unsigned long long)crc);
+	}
+
+	free(buf);
+
+	return 0;
+}
diff --git a/src/isa-l/crc/crc64_funcs_test.c b/src/isa-l/crc/crc64_funcs_test.c
new file mode 100644
index 00000000..f638f0f9
--- /dev/null
+++ b/src/isa-l/crc/crc64_funcs_test.c
@@ -0,0 +1,290 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "crc64.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define MAX_BUF 512
+#define TEST_SIZE 20
+
+// Short aliases for the fixed-width unsigned types.
+// NOTE(review): none of these is referenced by the code in this file.
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+// Signature shared by every crc64 routine: (seed, buffer, length) -> crc
+typedef uint64_t(*crc64_func_t) (uint64_t, const uint8_t *, uint64_t);
+
+// One test entry: a printable name, the function under test and the
+// reference (base) implementation its results are compared against.
+typedef struct func_case {
+	const char *note;	// points at a string literal, so must not be written through
+	crc64_func_t crc64_func_call;
+	crc64_func_t crc64_ref_call;
+} func_case_t;
+
+// Table of the crc64 variants exercised by main()
+func_case_t test_funcs[] = {
+	{"crc64_ecma_norm", crc64_ecma_norm, crc64_ecma_norm_base},
+	{"crc64_ecma_refl", crc64_ecma_refl, crc64_ecma_refl_base},
+	{"crc64_iso_norm", crc64_iso_norm, crc64_iso_norm_base},
+	{"crc64_iso_refl", crc64_iso_refl, crc64_iso_refl_base},
+	{"crc64_jones_norm", crc64_jones_norm, crc64_jones_norm_base},
+	{"crc64_jones_refl", crc64_jones_refl, crc64_jones_refl_base}
+};
+
+// Generates pseudo-random data
+
+// Fill the first buffer_size bytes of buf with successive rand() values
+// (each value is truncated to one byte by the store).
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	unsigned char *dst = buf;
+	long remaining;
+
+	for (remaining = buffer_size; remaining > 0; remaining--)
+		*dst++ = rand();
+}
+
+// Test cases
+// Each takes the function/reference pair to exercise and returns the number
+// of mismatches it counted (0 means pass).
+int zeros_test(func_case_t * test_func);
+
+int simple_pattern_test(func_case_t * test_func);
+
+int seeds_sizes_test(func_case_t * test_func);
+
+int eob_test(func_case_t * test_func);
+
+int update_test(func_case_t * test_func);
+
+// Non-zero enables per-case output; main() sets it to argc - 1.
+int verbose = 0;
+// Shared test buffer; main() allocates MAX_BUF * TEST_SIZE bytes for it.
+void *buf_alloc = NULL;
+
+/*
+ * Run every test case against every entry of test_funcs[].
+ *
+ * Any command-line argument turns on verbose output. The shared buffer
+ * buf_alloc is allocated here and released before returning.
+ *
+ * Returns the number of table entries with at least one failing test,
+ * or -1 if the buffer cannot be allocated.
+ */
+int main(int argc, char *argv[])
+{
+	int fail = 0, fail_case;
+	int ret;
+	size_t i;		// unsigned: compared against a sizeof expression
+	func_case_t *test_func;
+
+	verbose = argc - 1;
+
+	// Align to MAX_BUF boundary
+	ret = posix_memalign(&buf_alloc, MAX_BUF, MAX_BUF * TEST_SIZE);
+	if (ret) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+	srand(TEST_SEED);
+	printf("CRC64 Tests\n");
+
+	for (i = 0; i < sizeof(test_funcs) / sizeof(test_funcs[0]); i++) {
+		fail_case = 0;
+		test_func = &test_funcs[i];
+
+		printf("Test %s ", test_func->note);
+		fail_case += zeros_test(test_func);
+		fail_case += simple_pattern_test(test_func);
+		fail_case += seeds_sizes_test(test_func);
+		fail_case += eob_test(test_func);
+		fail_case += update_test(test_func);
+		printf("Test %s done: %s\n", test_func->note, fail_case ? "Fail" : "Pass");
+
+		if (fail_case) {
+			printf("\n%s Failed %d tests\n", test_func->note, fail_case);
+			fail++;
+		}
+	}
+
+	printf("CRC64 Tests all done: %s\n", fail ? "Fail" : "Pass");
+
+	// Release the shared buffer; no test runs after this point
+	free(buf_alloc);
+	buf_alloc = NULL;
+
+	return fail;
+}
+
+// Test of all zeros
+// Test of all zeros: compares the function under test with the reference
+// over MAX_BUF * 10 zero bytes, seeded with TEST_SEED.
+// Returns 1 on mismatch (and prints both values), 0 otherwise.
+int zeros_test(func_case_t * test_func)
+{
+	uint64_t crc, crc_ref;
+	int fail = 0;
+	unsigned char *buf = (unsigned char *)buf_alloc;
+
+	memset(buf, 0, MAX_BUF * 10);
+	crc = test_func->crc64_func_call(TEST_SEED, buf, MAX_BUF * 10);
+	crc_ref = test_func->crc64_ref_call(TEST_SEED, buf, MAX_BUF * 10);
+
+	if (crc != crc_ref) {
+		fail++;
+		printf("\n opt ref\n");
+		printf(" ------ ------\n");
+		// uint64_t is not guaranteed to be 'long'; cast to match %llx
+		printf("crc zero = 0x%16llx 0x%16llx \n",
+		       (unsigned long long)crc, (unsigned long long)crc_ref);
+	} else
+		printf(".");
+
+	return fail;
+}
+
+// Another simple test pattern
+// Another simple test pattern: MAX_BUF bytes of 0x8a, seeded with TEST_SEED.
+// Returns 1 if the function under test disagrees with the reference, else 0.
+int simple_pattern_test(func_case_t * test_func)
+{
+	uint64_t crc, crc_ref;
+	int fail = 0;
+	unsigned char *buf = (unsigned char *)buf_alloc;
+
+	memset(buf, 0x8a, MAX_BUF);
+	crc = test_func->crc64_func_call(TEST_SEED, buf, MAX_BUF);
+	crc_ref = test_func->crc64_ref_call(TEST_SEED, buf, MAX_BUF);
+	if (crc != crc_ref)
+		fail++;
+	if (verbose)
+		// uint64_t is not guaranteed to be 'long'; cast to match %llx
+		printf("crc all 8a = 0x%16llx 0x%16llx\n",
+		       (unsigned long long)crc, (unsigned long long)crc_ref);
+	else
+		printf(".");
+
+	return fail;
+}
+
+// Random-data tests in three phases:
+//  1. TEST_SIZE consecutive MAX_BUF-byte blocks of random data, one random seed;
+//  2. every length from MAX_BUF down to 0 on the start of the buffer;
+//  3. 20 rounds, each with a new random seed and freshly randomized data.
+// Returns the number of mismatches between the tested and reference functions.
+int seeds_sizes_test(func_case_t * test_func)
+{
+	uint64_t crc, crc_ref;
+	int fail = 0;
+	int i;
+	uint64_t r, s;
+	unsigned char *buf = NULL;
+
+	// Do a few random tests
+	buf = (unsigned char *)buf_alloc;	//reset buf
+	r = rand();
+	rand_buffer(buf, MAX_BUF * TEST_SIZE);
+
+	for (i = 0; i < TEST_SIZE; i++) {
+		crc = test_func->crc64_func_call(r, buf, MAX_BUF);
+		crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF);
+		if (crc != crc_ref)
+			fail++;
+		if (verbose)
+			printf("crc rand%3d = 0x%16llx 0x%16llx\n", i,
+			       (unsigned long long)crc, (unsigned long long)crc_ref);
+		else
+			printf(".");
+		buf += MAX_BUF;
+	}
+
+	// Do a few random sizes
+	buf = (unsigned char *)buf_alloc;	//reset buf
+	r = rand();
+
+	// includes the zero-length case (i == 0)
+	for (i = MAX_BUF; i >= 0; i--) {
+		crc = test_func->crc64_func_call(r, buf, i);
+		crc_ref = test_func->crc64_ref_call(r, buf, i);
+		if (crc != crc_ref) {
+			fail++;
+			printf("fail random size%i 0x%16llx 0x%16llx\n", i,
+			       (unsigned long long)crc, (unsigned long long)crc_ref);
+		} else
+			printf(".");
+	}
+
+	// Try different seeds
+	for (s = 0; s < 20; s++) {
+		buf = (unsigned char *)buf_alloc;	//reset buf
+
+		r = rand();	// just to get a new seed
+		rand_buffer(buf, MAX_BUF * TEST_SIZE);	// new pseudo-rand data
+
+		if (verbose)
+			// uint64_t is not guaranteed to be 'long'; cast to match %llx
+			printf("seed = 0x%llx\n", (unsigned long long)r);
+
+		for (i = 0; i < TEST_SIZE; i++) {
+			crc = test_func->crc64_func_call(r, buf, MAX_BUF);
+			crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF);
+			if (crc != crc_ref)
+				fail++;
+			if (verbose)
+				printf("crc rand%3d = 0x%16llx 0x%16llx\n", i,
+				       (unsigned long long)crc, (unsigned long long)crc_ref);
+			else
+				printf(".");
+			buf += MAX_BUF;
+		}
+	}
+
+	return fail;
+}
+
+// Run tests at end of buffer
+// Run tests at end of buffer: the last TEST_SIZE bytes of the
+// MAX_BUF * TEST_SIZE allocation are checksummed with shrinking lengths
+// (TEST_SIZE down to 1), each run ending exactly at the end of the buffer.
+// Returns the number of mismatches between the tested and reference functions.
+int eob_test(func_case_t * test_func)
+{
+	uint64_t crc, crc_ref;
+	int fail = 0;
+	int i;
+	unsigned char *buf = NULL;
+
+	buf = (unsigned char *)buf_alloc;	//reset buf
+	// (MAX_BUF - 1) * TEST_SIZE == MAX_BUF * TEST_SIZE - TEST_SIZE
+	buf = buf + ((MAX_BUF - 1) * TEST_SIZE);	//Line up TEST_SIZE from end
+	for (i = 0; i < TEST_SIZE; i++) {
+		crc = test_func->crc64_func_call(TEST_SEED, buf + i, TEST_SIZE - i);
+		crc_ref = test_func->crc64_ref_call(TEST_SEED, buf + i, TEST_SIZE - i);
+		if (crc != crc_ref)
+			fail++;
+		if (verbose)
+			// uint64_t is not guaranteed to be 'long'; cast to match %llx
+			printf("crc eob rand%3d = 0x%16llx 0x%16llx\n", i,
+			       (unsigned long long)crc, (unsigned long long)crc_ref);
+		else
+			printf(".");
+	}
+
+	return fail;
+}
+
+// Update-mode test: the reference processes the whole MAX_BUF * TEST_SIZE
+// buffer in one call; the function under test processes it in TEST_SIZE
+// chunks of MAX_BUF bytes, feeding each result back in as the next seed.
+// Returns 1 if the final values differ, 0 otherwise.
+int update_test(func_case_t * test_func)
+{
+	uint64_t crc, crc_ref;
+	int fail = 0;
+	int i;
+	uint64_t r;
+	unsigned char *buf = NULL;
+
+	buf = (unsigned char *)buf_alloc;	//reset buf
+	r = rand();
+	// Start from the seed so crc is defined even before the first chunk
+	crc = r;
+	// Process the whole buf with reference func single call.
+	crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF * TEST_SIZE);
+	// Process buf with update method.
+	for (i = 0; i < TEST_SIZE; i++) {
+		crc = test_func->crc64_func_call(r, buf, MAX_BUF);
+		// Update crc seeds and buf pointer.
+		r = crc;
+		buf += MAX_BUF;
+	}
+
+	if (crc != crc_ref)
+		fail++;
+	if (verbose)
+		// uint64_t is not guaranteed to be 'long'; cast to match %llx
+		printf("crc rand%3d = 0x%16llx 0x%16llx\n", i,
+		       (unsigned long long)crc, (unsigned long long)crc_ref);
+	else
+		printf(".");
+
+	return fail;
+}
diff --git a/src/isa-l/crc/crc64_iso_norm_by8.asm b/src/isa-l/crc/crc64_iso_norm_by8.asm
new file mode 100644
index 00000000..1a4195d6
--- /dev/null
+++ b/src/isa-l/crc/crc64_iso_norm_by8.asm
@@ -0,0 +1,581 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; uint64_t crc64_iso_norm_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+; Map the three C arguments onto registers for the target calling convention.
+; Per the API comment above: arg1 = init_crc, arg2 = buf, arg3 = len.
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+; Stack frame layout (offsets from rsp after the function's "sub rsp").
+; TMP is not referenced by name in this file; the short-buffer path uses the
+; 16 bytes at [rsp] directly as scratch space.
+; XMM_SAVE is where the win64 build stores xmm6-xmm13 (8 slots of 16 bytes).
+; NOTE(review): the trailing +8 presumably re-aligns rsp to 16 bytes after the
+; return address pushed by call, as the movdqa stack accesses need -- confirm.
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+; Entry point. Complements the incoming crc, reserves the stack frame, and
+; either branches to the short-buffer code (len < 256) or loads the first
+; 128 bytes into xmm0-xmm7 ready for the 128-byte folding loop.
+; On the long path: xmm11 = byte-reversal shuffle mask, xmm10 = rk3:rk4.
+align 16
+global crc64_iso_norm_by8:function
+crc64_iso_norm_by8:
+
+ not arg1 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; save xmm6-xmm13 on the stack so they can be restored in _cleanup
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ ; NOTE(review): jl is a signed comparison, so a len with the top bit set
+ ; would also take this branch -- confirm such lengths are never passed.
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial crc at correct place.
+ pslldq xmm10, 8
+
+ ; SHUF_MASK makes pshufb reverse the order of the 16 bytes in a register
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+ ; Each accumulator xmmN (N = 0..7) is replaced by
+ ;   clmul(low qword of xmmN, low qword of xmm10)      (imm 0x0)
+ ;   xor clmul(high qword of xmmN, high qword of xmm10) (imm 0x11)
+ ;   xor the next 16 input bytes, byte-reversed through xmm11.
+ ; xmm8/xmm13 hold copies of the accumulators; xmm9/xmm12 hold the new data.
+ ; xorps performs the same bitwise XOR as pxor.
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ ; the flags set by this sub drive the jge below
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; Collapse the eight accumulators into xmm7. xmm0..xmm6 are each multiplied
+ ; by their own constant pair (loaded 16 bytes at a time, so [rk9] brings in
+ ; rk9:rk10, [rk11] brings in rk11:rk12, and so on) and XORed into xmm7.
+ ; xmm10 is left holding rk1:rk2 for the 16-byte folding that follows.
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+; Fold xmm7 over the remaining input 16 bytes at a time using rk1:rk2 in
+; xmm10, then handle a final partial block of 1..15 bytes.
+; Also entered from _less_than_256 with xmm7 = first 16 bytes xor crc.
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ ; (the add undoes the counter bias, leaving the number of leftover bytes in arg3)
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+; Also entered from _less_than_32 for lengths 17..31, with arg3 = len - 16.
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ ; load the 16 bytes that end exactly at the end of the buffer
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ ; (xor with mask1 flips bit 7 of every control byte in xmm0)
+ pxor xmm0, [mask1]
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+; Reduce the 128-bit value in xmm7 to the final 64-bit CRC, then restore
+; state and return the result in rax.
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ ; imm 0x01: high qword of xmm7 times low qword of xmm10 (rk5)
+ pclmulqdq xmm7, xmm10, 0x01 ; H*L
+ pslldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+; Also entered directly from _end_1to7, which skips the 64b fold above.
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7
+
+ ; mask3 keeps only the high qword of the copy in xmm1
+ movdqa xmm1, xmm7
+ pand xmm1, [mask3]
+ pclmulqdq xmm7, xmm10, 0x01
+ pxor xmm7, xmm1
+
+ pclmulqdq xmm7, xmm10, 0x11
+ pxor xmm7, xmm0
+ ; the result is the low qword of xmm7
+ pextrq rax, xmm7, 0
+
+; Common exit. rax is complemented here, mirroring the "not arg1" at entry.
+_cleanup:
+ not rax
+%ifidn __OUTPUT_FORMAT__, win64
+ ; restore the xmm registers saved in the prologue
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Short-buffer paths (len < 256): buffers of 32..255 bytes join the 16-byte
+; folding loop; shorter ones are dispatched by _less_than_32.
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+align 16
+; len < 32: dispatch on 0, 1..15, exactly 16, or 17..31 bytes
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ ; (arg1 was complemented at entry and _cleanup complements rax again,
+ ; so a zero-length call returns the caller's init_crc unchanged)
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ ; 17..31 bytes: consume the first 16, the rest is handled as a partial block
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+; 1..15 bytes: copy the input into a zeroed 16-byte stack slot in 8/4/2/1-byte
+; pieces (so nothing past the end of the buffer is read), then load that slot
+; as a single block. r9 keeps the original length; r11 walks the stack slot.
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ ; load the assembled block from the stack slot
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ ; rax = address of the pshufb control for the original length in r9
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Right shift (8-length) bytes in XMM
+ add rax, 8
+ movdqu xmm0, [rax]
+ pshufb xmm7,xmm0
+
+ ; goes straight to the Barrett step, bypassing the 64b fold in _128_done
+ jmp _barrett
+align 16
+; exactly 16 bytes: a single block xor crc, no folding needed
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+; The code loads these 16 bytes at a time, so each odd-numbered label is read
+; together with the following even-numbered one (rk1:rk2, rk3:rk4, ...).
+; rk3:rk4 drive the 128-byte loop, rk9..rk20 and rk1:rk2 the 8-to-1 fold,
+; rk1:rk2 the 16-byte folding, rk5:rk6 the 64b fold, rk7:rk8 the Barrett step.
+align 16
+
+rk1:
+DQ 0x0000000000000145
+rk2:
+DQ 0x0000000000001db7
+rk3:
+DQ 0x000100000001001a
+rk4:
+DQ 0x001b0000001b015e
+rk5:
+DQ 0x0000000000000145
+rk6:
+DQ 0x0000000000000000
+rk7:
+DQ 0x000000000000001b
+rk8:
+DQ 0x000000000000001b
+rk9:
+DQ 0x0150145145145015
+rk10:
+DQ 0x1c71db6db6db71c7
+rk11:
+DQ 0x0001110110110111
+rk12:
+DQ 0x001aab1ab1ab1aab
+rk13:
+DQ 0x0000014445014445
+rk14:
+DQ 0x00001daab71daab7
+rk15:
+DQ 0x0000000101000101
+rk16:
+DQ 0x0000001b1b001b1b
+rk17:
+DQ 0x0000000001514515
+rk18:
+DQ 0x000000001c6db6c7
+rk19:
+DQ 0x0000000000011011
+rk20:
+DQ 0x00000000001ab1ab
+
+; mask1: XORed into pshufb controls to flip bit 7 of every byte
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+; mask2: NOTE(review) not referenced by the code in this file
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+; mask3: selects the high qword (used in the Barrett step)
+mask3:
+dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
+
+; pshufb control that reverses the order of the 16 bytes in a register
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+; the code reads 16 bytes from (pshufb_shf_table + 16 - n), optionally + 8
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_iso_norm_by8, 01, 00, 0020
diff --git a/src/isa-l/crc/crc64_iso_refl_by8.asm b/src/isa-l/crc/crc64_iso_refl_by8.asm
new file mode 100644
index 00000000..d7ed8ae3
--- /dev/null
+++ b/src/isa-l/crc/crc64_iso_refl_by8.asm
@@ -0,0 +1,544 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_iso_refl_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+; arg1/arg2/arg3 name the registers that hold init_crc, buf and len for the selected output format.
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+; Stack frame layout: byte offsets relative to rsp after the prologue's "sub rsp, VARIABLE_OFFSET".
+%define TMP 16*0 ; offset 0; the symbol is not referenced again in this file (the <16B path uses [rsp] directly)
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2 ; start of the 8 x 16B area where xmm6-xmm13 are saved
+ %define VARIABLE_OFFSET 16*10+8 ; frame size; NOTE(review): the +8 presumably re-aligns rsp to 16B after the call pushed the return address - confirm
+%else
+ %define VARIABLE_OFFSET 16*2+8 ; frame size; NOTE(review): the +8 presumably re-aligns rsp to 16B after the call pushed the return address - confirm
+%endif
+
+; crc64_iso_refl_by8(init_crc, buf, len): entry point. Buffers of 256B or more start the 128B folding here; shorter ones branch to _less_than_256.
+align 16
+global crc64_iso_refl_by8:function
+crc64_iso_refl_by8:
+ ; uint64_t c = crc ^ 0xffffffff,ffffffffL;
+ not arg1
+ sub rsp, VARIABLE_OFFSET ; reserve the local frame; _cleanup adds the same amount back
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; save xmm6-xmm13 on the stack; _cleanup reloads them before returning
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256 ; jl is a signed comparison of len against 256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc (already complemented) in the low 64 bits, upper 64 bits zeroed
+ ; load the first 128B of data into xmm0-xmm7 (unaligned loads)
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value into the first 16B of data
+ pxor xmm0, xmm10
+ movdqa xmm10, [rk3] ;xmm10 has rk3 (low qword) and rk4 (high qword)
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers (xmm0-xmm7) in parallel using rk3/rk4 held in xmm10
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128
+
+ prefetchnta [arg2+fetch_dist+0] ; non-temporal prefetch, fetch_dist (1024) bytes past the block being loaded
+ movdqu xmm9, [arg2+16*0] ; next 16B of input for xmm0
+ movdqu xmm12, [arg2+16*1] ; next 16B of input for xmm1
+ movdqa xmm8, xmm0 ; copies, so both halves of each accumulator can be multiplied
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x10 ; low qword of xmm0 x high qword of xmm10 (rk4)
+ pclmulqdq xmm8, xmm10 , 0x1 ; high qword of the copy x low qword of xmm10 (rk3)
+ pclmulqdq xmm1, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm0, xmm9 ; new accumulator = both products XOR the newly loaded data
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2] ; same fold for xmm2 / xmm3
+ movdqu xmm12, [arg2+16*3]
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm3, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4] ; same fold for xmm4 / xmm5
+ movdqu xmm12, [arg2+16*5]
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm5, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6] ; same fold for xmm6 / xmm7
+ movdqu xmm12, [arg2+16*7]
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm7, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128 ; the flags set here drive the jge below
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128 ; step past the last 128B block that was loaded
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register (xmm7), using a different constant pair for each source register
+ ; xmm0 to xmm7
+ movdqa xmm10, [rk9] ; rk9 (low qword) and rk10 (high qword)
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x1 ; high qword of xmm0 x low qword of xmm10
+ pclmulqdq xmm8, xmm10, 0x10 ; low qword of the copy x high qword of xmm10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+ ;xmm1 to xmm7
+ movdqa xmm10, [rk11] ; rk11 and rk12
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13] ; xmm2 to xmm7: rk13 and rk14
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15] ; xmm3 to xmm7: rk15 and rk16
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17] ; xmm4 to xmm7: rk17 and rk18
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19] ; xmm5 to xmm7: rk19 and rk20
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+ ; xmm6 to xmm7
+ movdqa xmm10, [rk1] ; rk1 and rk2; xmm10 keeps this pair for the 16B folding that follows
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128 ; fewer than 16 bytes remain in memory
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes are in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time (also entered from _less_than_256 with xmm7, xmm10 and arg3 prepared there)
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1 ; high qword of xmm7 x low qword of xmm10 (rk1)
+ pclmulqdq xmm8, xmm10, 0x10 ; low qword of the copy x high qword of xmm10 (rk2)
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2] ; next 16B of input
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16 ; arg3 = number of trailing bytes still in memory
+ je _128_done ; none left: xmm7 already holds the final 128 bits
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can move the input pointer back so that the load covers exactly the last 16 bytes of the buffer.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ movdqa xmm2, xmm7
+ movdqu xmm1, [arg2 - 16 + arg3] ; the 16 bytes that end at the end of the buffer
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ movdqu xmm0, [rax] ; pshufb control read at byte offset arg3 into the table
+
+
+ pshufb xmm7, xmm0 ; moves the low arg3 bytes of xmm7 to the top lanes and zeroes the other lanes
+ pxor xmm0, [mask3] ; flip bit 7 of every control byte: lanes that were zeroed now select and vice versa
+ pshufb xmm2, xmm0 ; moves the upper 16-arg3 bytes of the xmm7 copy down to the low lanes and zeroes the top arg3 lanes
+
+ pblendvb xmm2, xmm1 ;xmm0 is implicit: lanes whose control byte has bit 7 set are taken from xmm1
+ ;;;;;;;;;;
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1 ; fold 16 bytes with rk1/rk2 (xmm10)
+
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value (held in xmm7)
+ movdqa xmm10, [rk5] ; rk5 (low qword) and rk6 (high qword)
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0 ; low qword of xmm7 x rk5
+ psrldq xmm0, 8 ; bring the high qword of the saved value down to the low qword
+ pxor xmm7, xmm0
+
+ ;barrett reduction (also entered directly from _end_1to7, skipping the 64b fold)
+_barrett:
+ movdqa xmm1, xmm7 ; keep the pre-reduction value for the final XOR
+ movdqa xmm10, [rk7] ; rk7 (low qword) and rk8 (high qword)
+
+ pclmulqdq xmm7, xmm10, 0 ; low qword of xmm7 x rk7
+ movdqa xmm2, xmm7
+ pclmulqdq xmm7, xmm10, 0x10 ; low qword of that product x rk8
+ pslldq xmm2, 8 ; shift the saved first product up by 8 bytes
+ pxor xmm7, xmm2
+ pxor xmm7, xmm1
+ pextrq rax, xmm7, 1 ; the result is taken from the high qword of xmm7
+
+_cleanup:
+ ; return c ^ 0xffffffff, ffffffffL;  (rax holds c on entry)
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0] ; reload the xmm registers saved in the prologue
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET ; release the frame reserved in the prologue
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256: ; reached from the entry point when len < 256
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value (already complemented by the entry code)
+ movdqu xmm7, [arg2] ; load the first 16 bytes of data
+ pxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+align 16
+_less_than_32: ; reached when len < 32
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1 ; arg1 is already complemented; the "not rax" in _cleanup turns it back into init_crc
+ test arg3, arg3
+ je _cleanup
+
+ movq xmm0, arg1 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the first 16 bytes of data (here 16 < len < 32)
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16 ; arg3 = number of bytes after the first 16 (1..15)
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left: ; reached when 0 < len < 16
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp ; r11 = write cursor into the 16B scratch area at [rsp]
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm7, [rsp] ; the copied bytes, zero-padded to 16B
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+
+ cmp r9, 8 ; r9 still holds the original length
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax + r9] ; control that moves the low r9 bytes to the top lanes and zeroes the rest
+ pshufb xmm7,xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Left shift (8-length) bytes in XMM
+ movdqu xmm0, [rax + r9 + 8]
+ pshufb xmm7,xmm0
+
+ jmp _barrett ; goes straight to the Barrett reduction, skipping the 64b fold in _128_done
+
+align 16
+_exact_16_left: ; reached when len == 16
+ movdqu xmm7, [arg2] ; the whole buffer fits in one register
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants; the code loads them 16 bytes at a time, so each odd/even pair (rk1/rk2, rk3/rk4, ...) fills one xmm register
+align 16
+; rk7 = floor(2^128/Q)
+; rk8 = Q
+rk1: ; rk1/rk2: 16B folding (_16B_reduction_loop, _get_last_two_xmms, xmm6 -> xmm7)
+DQ 0xf500000000000001
+rk2:
+DQ 0x6b70000000000001
+rk3: ; rk3/rk4: 128B folding (_fold_128_B_loop)
+DQ 0xb001000000010000
+rk4:
+DQ 0xf501b0000001b000
+rk5: ; rk5/rk6: 64b fold in _128_done (only the low qword is selected there)
+DQ 0xf500000000000001
+rk6:
+DQ 0x0000000000000000
+rk7: ; rk7/rk8: Barrett reduction (_barrett)
+DQ 0xb000000000000001
+rk8:
+DQ 0xb000000000000000
+rk9: ; rk9/rk10: fold xmm0 into xmm7
+DQ 0xe014514514501501
+rk10:
+DQ 0x771db6db6db71c71
+rk11: ; rk11/rk12: fold xmm1 into xmm7
+DQ 0xa101101101110001
+rk12:
+DQ 0x1ab1ab1ab1aab001
+rk13: ; rk13/rk14: fold xmm2 into xmm7
+DQ 0xf445014445000001
+rk14:
+DQ 0x6aab71daab700001
+rk15: ; rk15/rk16: fold xmm3 into xmm7
+DQ 0xb100010100000001
+rk16:
+DQ 0x01b001b1b0000001
+rk17: ; rk17/rk18: fold xmm4 into xmm7
+DQ 0xe145150000000001
+rk18:
+DQ 0x76db6c7000000001
+rk19: ; rk19/rk20: fold xmm5 into xmm7
+DQ 0xa011000000000001
+rk20:
+DQ 0x1b1ab00000000001
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; the code reads 16 bytes starting at a byte offset into the 32 bytes below;
+; different offsets result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; control bytes with bit 7 set make pshufb zero that lane
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; the code in this file reads at offsets 1..15, so the last byte (offset 31) is never loaded
+
+
+mask: ; not referenced by the code in this file
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2: ; not referenced by the code in this file
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3: ; XORed into a pshufb control in _get_last_two_xmms to flip bit 7 of every control byte
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum  (NOTE(review): slversion is not defined in this file; presumably a macro from reg_sizes.asm - confirm)
+slversion crc64_iso_refl_by8, 01, 00, 0023
diff --git a/src/isa-l/crc/crc64_jones_norm_by8.asm b/src/isa-l/crc/crc64_jones_norm_by8.asm
new file mode 100644
index 00000000..0e5e75a2
--- /dev/null
+++ b/src/isa-l/crc/crc64_jones_norm_by8.asm
@@ -0,0 +1,581 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; uint64_t crc64_jones_norm_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+; arg1/arg2/arg3 name the registers that hold init_crc, buf and len for the selected output format.
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+; Stack frame layout: byte offsets relative to rsp after the prologue's "sub rsp,VARIABLE_OFFSET".
+%define TMP 16*0 ; offset 0; the symbol is not referenced again in this file (the <16B path uses [rsp] directly)
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2 ; start of the 8 x 16B area where xmm6-xmm13 are saved
+ %define VARIABLE_OFFSET 16*10+8 ; frame size; NOTE(review): the +8 presumably re-aligns rsp to 16B after the call pushed the return address - confirm
+%else
+ %define VARIABLE_OFFSET 16*2+8 ; frame size; NOTE(review): the +8 presumably re-aligns rsp to 16B after the call pushed the return address - confirm
+%endif
+align 16
+global crc64_jones_norm_by8:function
+crc64_jones_norm_by8:
+; entry point: crc64_jones_norm_by8(init_crc, buf, len). Buffers of 256B or more start the 128B folding here; shorter ones branch to _less_than_256.
+ not arg1 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET ; reserve the local frame; _cleanup adds the same amount back
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; save xmm6-xmm13 on the stack; _cleanup reloads them before returning
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256 ; jl is a signed comparison of len against 256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc (already complemented) in the low 64 bits, upper 64 bits zeroed
+
+ ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial crc at correct place.
+ pslldq xmm10, 8
+
+ movdqa xmm11, [SHUF_MASK] ; pshufb control that reverses the 16 bytes of a register; kept in xmm11 for all later loads
+ ; load the first 128B of data into xmm0-xmm7 (unaligned loads)
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11 ; byte-reflect each 16B block
+ ; XOR the initial_crc value into the first (byte-reflected) block
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 (low qword) and rk4 (high qword)
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers (xmm0-xmm7) in parallel using rk3/rk4 held in xmm10
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0] ; non-temporal prefetch, fetch_dist (1024) bytes past the block being loaded
+ movdqu xmm9, [arg2+16*0] ; next 16B of input for xmm0
+ movdqu xmm12, [arg2+16*1] ; next 16B of input for xmm1
+ pshufb xmm9, xmm11 ; byte-reflect the new data with SHUF_MASK (xmm11)
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0 ; copies, so both halves of each accumulator can be multiplied
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0 ; low qword of xmm0 x low qword of xmm10 (rk3)
+ pclmulqdq xmm8, xmm10 , 0x11 ; high qword of the copy x high qword of xmm10 (rk4)
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9 ; new accumulator = both products XOR the newly loaded data
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2] ; same fold for xmm2 / xmm3
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4] ; same fold for xmm4 / xmm5
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6] ; same fold for xmm6 / xmm7
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128 ; the flags set here drive the jge below
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128 ; step past the last 128B block that was loaded
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register (xmm7), using a different constant pair for each source register
+
+ movdqa xmm10, [rk9] ; xmm0 to xmm7: rk9 (low qword) and rk10 (high qword)
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11 ; high qword of xmm0 x high qword of xmm10
+ pclmulqdq xmm8, xmm10, 0x0 ; low qword of the copy x low qword of xmm10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11] ; xmm1 to xmm7: rk11 and rk12
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13] ; xmm2 to xmm7: rk13 and rk14
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15] ; xmm3 to xmm7: rk15 and rk16
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17] ; xmm4 to xmm7: rk17 and rk18
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19] ; xmm5 to xmm7: rk19 and rk20
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2 (xmm6 to xmm7); it keeps this pair for the 16B folding that follows
+
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128 ; fewer than 16 bytes remain in memory
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes are in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time (also entered from _less_than_256 with xmm7, xmm10, xmm11 and arg3 prepared there)
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11 ; high qword of xmm7 x high qword of xmm10 (rk2)
+ pclmulqdq xmm8, xmm10, 0x0 ; low qword of the copy x low qword of xmm10 (rk1)
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2] ; next 16B of input
+ pshufb xmm0, xmm11 ; byte-reflect it
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16 ; arg3 = number of trailing bytes still in memory
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can move the input pointer back so that the load covers exactly the last 16 bytes of the buffer.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3] ; the 16 bytes that end at the end of the buffer
+ pshufb xmm1, xmm11 ; byte-reflect them
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax] ; pshufb control read at byte offset 16-arg3 into the table
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1] ; flip bit 7 of every control byte: lanes that were zeroed now select and vice versa
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit: lanes whose control byte has bit 7 set are taken from xmm2
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value (held in xmm7)
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x01 ; H*L: high qword of xmm7 x low qword of xmm10 (rk5)
+ pslldq xmm0, 8 ; move the low qword of the saved value up to the high qword
+ pxor xmm7, xmm0
+
+ ;barrett reduction (also entered directly from _end_1to7, skipping the 64b fold)
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7 ; keep the pre-reduction value for the final XOR
+
+ movdqa xmm1, xmm7
+ pand xmm1, [mask3] ; keep only the high qword of the copy
+ pclmulqdq xmm7, xmm10, 0x01 ; high qword of xmm7 x rk7
+ pxor xmm7, xmm1
+
+ pclmulqdq xmm7, xmm10, 0x11 ; high qword of xmm7 x rk8
+ pxor xmm7, xmm0
+ pextrq rax, xmm7, 0 ; the result is taken from the low qword of xmm7
+
+_cleanup:
+ not rax ; complement the value in rax to form the return value
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0] ; reload the xmm registers saved in the prologue
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET ; release the frame reserved in the prologue
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256: ; reached from the entry point when len < 256
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK] ; byte-reversal control used by every data load below
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value (already complemented by the entry code)
+ pslldq xmm0, 8 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the first 16 bytes of data
+ pshufb xmm7, xmm11 ; byte-reflect the data
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+align 16
+_less_than_32: ; reached when len < 32
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1 ; arg1 is already complemented; the "not rax" in _cleanup turns it back into init_crc
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK] ; byte-reversal control used by the data loads below
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the first 16 bytes of data (here 16 < len < 32)
+ pshufb xmm7, xmm11 ; byte-reflect the data
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16 ; arg3 = number of bytes after the first 16 (1..15)
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+align 16
+_less_than_16_left: ; reached when 0 < len < 16
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ pxor xmm1, xmm1
+ mov r11, rsp ; r11 = write cursor into the 16B scratch area at [rsp]
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp] ; the copied bytes, zero-padded to 16B
+ pshufb xmm7, xmm11 ; byte-reflect them
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9 ; r9 still holds the original length
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1] ; flip bit 7 of every control byte, turning the table's left-shift control into a right shift by 16-r9 bytes
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Right shift (8-length) bytes in XMM
+ add rax, 8
+ movdqu xmm0, [rax]
+ pshufb xmm7,xmm0
+
+ jmp _barrett ; goes straight to the Barrett reduction, skipping the 64b fold in _128_done
+align 16
+_exact_16_left: ; reached when len == 16
+ movdqu xmm7, [arg2] ; the whole buffer fits in one register
+ pshufb xmm7, xmm11 ; byte-reflect it
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants; the code loads them 16 bytes at a time, so each odd/even pair (rk1/rk2, rk3/rk4, ...) fills one xmm register
+align 16
+
+rk1: ; rk1/rk2: 16B folding (_16B_reduction_loop, _get_last_two_xmms, xmm6 -> xmm7)
+DQ 0x4445ed2750017038
+rk2:
+DQ 0x698b74157cfbd736
+rk3: ; rk3/rk4: 128B folding (_fold_128_B_loop)
+DQ 0x0cfcfb5101c4b775
+rk4:
+DQ 0x65403fd47cbec866
+rk5: ; rk5/rk6: 64b fold in _128_done (only the low qword is selected there)
+DQ 0x4445ed2750017038
+rk6:
+DQ 0x0000000000000000
+rk7: ; rk7/rk8: Barrett reduction (_barrett)
+DQ 0xddf3eeb298be6cf8
+rk8:
+DQ 0xad93d23594c935a9
+rk9: ; rk9/rk10: fold xmm0 into xmm7
+DQ 0xd8dc208e2ba527b4
+rk10:
+DQ 0xf032cfec76bb2bc5
+rk11: ; rk11/rk12: fold xmm1 into xmm7
+DQ 0xb536044f357f4238
+rk12:
+DQ 0xfdbf104d938ba67a
+rk13: ; rk13/rk14: fold xmm2 into xmm7
+DQ 0xeeddad9297a843e7
+rk14:
+DQ 0x3550bce629466473
+rk15: ; rk15/rk16: fold xmm3 into xmm7
+DQ 0x4e501e58ca43d25e
+rk16:
+DQ 0x13c961588f27f643
+rk17: ; rk17/rk18: fold xmm4 into xmm7
+DQ 0x3b60d00dcb1099bc
+rk18:
+DQ 0x44bf1f468c53b9a3
+rk19: ; rk19/rk20: fold xmm5 into xmm7
+DQ 0x96f2236e317179ee
+rk20:
+DQ 0xf00839aa0dd64bac
+
+mask1: ; XORed into a pshufb control (_get_last_two_xmms, _end_8to15) to flip bit 7 of every control byte
+dq 0x8080808080808080, 0x8080808080808080
+mask2: ; not referenced by the code in this file
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+mask3: ; ANDed in _barrett to keep only the high qword of a register
+dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
+
+SHUF_MASK: ; pshufb control that reverses the order of the 16 bytes (lane i takes source byte 15-i)
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; the code reads 16 bytes starting at a byte offset into the 64 bytes below;
+; different offsets result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; control bytes with bit 7 set make pshufb zero that lane
+dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 ; identity control (lane i takes source byte i)
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908 ; the 0x80 bytes supply the zeroed lanes for the _end_1to7 read, which starts past offset 16
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum  (NOTE(review): slversion is not defined in this file; presumably a macro from reg_sizes.asm - confirm)
+slversion crc64_jones_norm_by8, 01, 00, 0026
diff --git a/src/isa-l/crc/crc64_jones_refl_by8.asm b/src/isa-l/crc/crc64_jones_refl_by8.asm
new file mode 100644
index 00000000..39da6b82
--- /dev/null
+++ b/src/isa-l/crc/crc64_jones_refl_by8.asm
@@ -0,0 +1,544 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_jones_refl_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+; Bind arg1..arg3 (init_crc, buf, len) to the registers that carry the
+; function's three arguments: rcx/rdx/r8 when assembling for win64,
+; rdi/rsi/rdx for every other output format.
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+; Stack frame offsets, relative to rsp after "sub rsp, VARIABLE_OFFSET":
+; TMP = 0; the code stages inputs shorter than 16 bytes at [rsp].
+; XMM_SAVE = 32 (win64 only): eight 16-byte slots for xmm6-xmm13.
+; VARIABLE_OFFSET = frame size: 16*10+8 on win64, 16*2+8 otherwise.
+; NOTE(review): the extra 8 presumably re-aligns rsp to 16 bytes after the
+; caller's return address push, which the movdqa accesses need - confirm.
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+
+; crc64_jones_refl_by8(init_crc = arg1, buf = arg2, len = arg3), result in rax
+; Entry: inverts the incoming CRC, reserves the stack frame (and on win64
+; saves xmm6-xmm13), then either branches to the short-buffer code when
+; len < 256 or loads the first 128 bytes into xmm0-xmm7, XORs the inverted
+; CRC into the low 64 bits of xmm0 and falls into _fold_128_B_loop.
+align 16
+global crc64_jones_refl_by8:function
+crc64_jones_refl_by8:
+ ; uint64_t c = crc ^ 0xffffffff,ffffffffL;
+ not arg1
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; save xmm6-xmm13 on the stack; _cleanup restores them before returning
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B (jl below is a signed comparison)
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc (movq zeroes the upper 64 bits of xmm10)
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+; Main loop. Each pass advances buf by 128 bytes and replaces every
+; accumulator xmm0-xmm7 with
+; (acc.low64 clmul rk4) xor (acc.high64 clmul rk3) xor next 16 input bytes
+; where xmm10 holds rk3 in its low qword and rk4 in its high qword.
+; xmm8/xmm13 carry copies of the accumulators, xmm9/xmm12 the new data.
+; arg3 was pre-decremented by 256, so the loop repeats while it is still
+; >= 0 after subtracting 128.
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128
+
+ prefetchnta [arg2+fetch_dist+0] ; prefetch fetch_dist bytes ahead of buf
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x10 ; imm 0x10: low qword of xmm0 times high qword of xmm10
+ pclmulqdq xmm8, xmm10 , 0x1 ; imm 0x1: high qword of xmm8 times low qword of xmm10
+ pclmulqdq xmm1, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8 ; xorps is used here as a plain 128-bit XOR
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm3, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm5, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm7, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; Collapse the eight accumulators into xmm7. Each of xmm0..xmm6 is
+ ; multiplied by its own constant pair (rk9/rk10, rk11/rk12, ..., rk19/rk20,
+ ; and finally rk1/rk2 for xmm6) and XORed into xmm7. The last load leaves
+ ; rk1/rk2 in xmm10, which the 16-byte reduction code below relies on.
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+ ; xmm0 to xmm7
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x1 ; high qword of xmm0 times low qword of xmm10
+ pclmulqdq xmm8, xmm10, 0x10 ; low qword of xmm8 times high qword of xmm10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+ ;xmm1 to xmm7
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ ; xmm2 to xmm7
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ ; xmm3 to xmm7
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ ; xmm4 to xmm7
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ ; xmm5 to xmm7
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+ ; xmm6 to xmm7
+ movdqa xmm10, [rk1]
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+; Folds the running 16-byte remainder in xmm7 over the next 16 input bytes.
+; Expects xmm10 to hold rk1 (low qword) and rk2 (high qword), arg2 to point
+; at the next unread byte and arg3 to hold (bytes still unread - 16).
+; Reached by fall-through from the 8-to-1 fold and by jump from _less_than_256.
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1 ; high qword of xmm7 times rk1
+ pclmulqdq xmm8, xmm10, 0x10 ; low qword of xmm8 times rk2
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+; Handles the final 1..15 bytes (if any). After "add arg3, 16" arg3 is the
+; number of unread bytes; zero means xmm7 already covers the whole buffer.
+; Otherwise the last 16 bytes of the buffer are re-read (overlapping data
+; that was already consumed) and merged with xmm7 by byte shuffles:
+; xmm7 <- xmm7 shifted left by (16 - arg3) bytes
+; xmm2 <- old xmm7 shifted right by arg3 bytes, with its top arg3 bytes
+; taken from the re-read data in xmm1
+; and xmm7 is then folded once more (rk1/rk2 in xmm10) onto xmm2.
+; _get_last_two_xmms is also entered directly from _less_than_32 for 17..31 bytes.
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ movdqa xmm2, xmm7
+ movdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ movdqu xmm0, [rax]
+
+
+ pshufb xmm7, xmm0 ; control bytes with bit 7 set produce zero bytes
+ pxor xmm0, [mask3] ; flip bit 7 of every control byte: turns the left shift into the matching right shift
+ pshufb xmm2, xmm0
+
+ pblendvb xmm2, xmm1 ;xmm0 is implicit
+ ;;;;;;;;;;
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+; Reduces the 128-bit remainder in xmm7 to the 64-bit CRC in rax.
+; Step 1 (_128_done): multiply the low qword of xmm7 by rk5 and XOR in the
+; original high qword, shifted down into the low half.
+; Step 2 (_barrett): two carry-less multiplies by rk7 and rk8 (documented in
+; the data section as floor(2^128/Q) and Q); the result is taken from the
+; high qword of xmm7. _end_1to7 jumps straight to _barrett.
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5]
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0 ; low qword of xmm7 times rk5
+ psrldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm1, xmm7
+ movdqa xmm10, [rk7]
+
+ pclmulqdq xmm7, xmm10, 0 ; low qword of xmm7 times rk7
+ movdqa xmm2, xmm7
+ pclmulqdq xmm7, xmm10, 0x10 ; low qword of xmm7 times rk8
+ pslldq xmm2, 8
+ pxor xmm7, xmm2
+ pxor xmm7, xmm1
+ pextrq rax, xmm7, 1 ; the CRC is the high qword
+
+; Common exit: applies the final inversion to rax, restores xmm6-xmm13 on
+; win64, releases the stack frame and returns. Entered with rax holding the
+; un-inverted result (or the inverted init value for a zero-length buffer).
+_cleanup:
+ ; return c ^ 0xffffffff, ffffffffL;
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Buffers of 32..255 bytes: skip the 128-byte loop. Seeds xmm7 with the
+; first 16 bytes XOR the inverted CRC, loads rk1/rk2 and joins the 16-byte
+; reduction loop. Shorter buffers go to _less_than_32.
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+; Buffers of 0..31 bytes.
+; len == 0 : rax = inverted init; _cleanup inverts it back, so the
+; function returns init_crc unchanged.
+; len == 16 : _exact_16_left
+; len < 16 : _less_than_16_left
+; len 17..31: consume the first 16 bytes here, then let _get_last_two_xmms
+; merge the remaining 1..15 bytes.
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movq xmm0, arg1 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+; Buffers of 1..15 bytes. The input is copied in 8/4/2/1-byte pieces into a
+; zeroed 16-byte slot at [rsp], so no byte beyond the buffer is read. The
+; slot is then loaded into xmm7, XORed with the inverted CRC (xmm0) and
+; shifted with a pshufb_shf_table entry selected by the original length (r9):
+; 8..15 bytes -> table offset r9, continue at _128_done
+; 1..7 bytes -> table offset r9+8, continue at _barrett
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax + r9]
+ pshufb xmm7,xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Left shift (8-length) bytes in XMM
+ movdqu xmm0, [rax + r9 + 8]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+
+; Buffer of exactly 16 bytes: the data XOR the inverted CRC (xmm0, set up in
+; _less_than_32) is already the 128-bit remainder, so go straight to the
+; final reduction.
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+; The code loads these as 16-byte pairs (rkN in the low qword, rkN+1 in the
+; high qword), so each odd-numbered label must stay 16-byte aligned:
+; rk1/rk2 - 16-byte fold (_16B_reduction_loop, _get_last_two_xmms, xmm6 in the 8-to-1 fold)
+; rk3/rk4 - 128-byte fold (_fold_128_B_loop)
+; rk5/rk6 - 128-bit to 64-bit fold in _128_done (only the low qword, rk5, is multiplied)
+; rk7/rk8 - Barrett reduction in _barrett
+; rk9..rk20 - per-register constants for folding xmm0..xmm5 into xmm7
+align 16
+; rk7 = floor(2^128/Q)
+; rk8 = Q
+rk1:
+DQ 0x381d0015c96f4444
+rk2:
+DQ 0xd9d7be7d505da32c
+rk3:
+DQ 0x768361524d29ed0b
+rk4:
+DQ 0xcc26fa7c57f8054c
+rk5:
+DQ 0x381d0015c96f4444
+rk6:
+DQ 0x0000000000000000
+rk7:
+DQ 0x3e6cfa329aef9f77
+rk8:
+DQ 0x2b5926535897936a
+rk9:
+DQ 0x5bc94ba8e2087636
+rk10:
+DQ 0x6cf09c8f37710b75
+rk11:
+DQ 0x3885fd59e440d95a
+rk12:
+DQ 0xbccba3936411fb7e
+rk13:
+DQ 0xe4dd0d81cbfce585
+rk14:
+DQ 0xb715e37b96ed8633
+rk15:
+DQ 0xf49784a634f014e4
+rk16:
+DQ 0xaf86efb16d9ab4fb
+rk17:
+DQ 0x7b3211a760160db8
+rk18:
+DQ 0xa062b2319d66692f
+rk19:
+DQ 0xef3d1d18ed889ed2
+rk20:
+DQ 0x6ba4d760ab38201e
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+
+; 16-byte mask constants. Within this file only mask3 is referenced:
+; _get_last_two_xmms XORs it into a pshufb control vector to flip bit 7 of
+; every byte. mask and mask2 are not referenced by the code in this file.
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3:
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_jones_refl_by8, 01, 00, 0029
diff --git a/src/isa-l/crc/crc64_multibinary.asm b/src/isa-l/crc/crc64_multibinary.asm
new file mode 100644
index 00000000..a20c8a79
--- /dev/null
+++ b/src/isa-l/crc/crc64_multibinary.asm
@@ -0,0 +1,89 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;; uint64_t crc64_func(uint64_t init_crc, const unsigned char *buf, uint64_t len);
+;;;
+
+default rel
+[bits 64]
+
+; WRT_OPT expands to NASM's "wrt ..plt" qualifier for elf64 output and to
+; nothing for every other format. It is not used directly in this file;
+; NOTE(review): presumably consumed by the mbin_* macros from multibinary.asm - confirm.
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern crc64_ecma_refl_by8
+extern crc64_ecma_refl_base
+
+extern crc64_ecma_norm_by8
+extern crc64_ecma_norm_base
+
+extern crc64_iso_refl_by8
+extern crc64_iso_refl_base
+
+extern crc64_iso_norm_by8
+extern crc64_iso_norm_base
+
+extern crc64_jones_refl_by8
+extern crc64_jones_refl_base
+
+extern crc64_jones_norm_by8
+extern crc64_jones_norm_base
+
+section .text
+
+%include "multibinary.asm"
+
+; One public entry point per CRC64 variant. Each pair of macro invocations
+; names the public symbol, its *_base routine and its *_by8 routine.
+; NOTE(review): the macros come from multibinary.asm (not shown here);
+; presumably they emit a run-time dispatcher that picks *_by8 when the CPU
+; supports carry-less multiply and *_base otherwise - confirm.
+mbin_interface crc64_ecma_refl
+mbin_dispatch_init_clmul crc64_ecma_refl, crc64_ecma_refl_base, crc64_ecma_refl_by8
+mbin_interface crc64_ecma_norm
+mbin_dispatch_init_clmul crc64_ecma_norm, crc64_ecma_norm_base, crc64_ecma_norm_by8
+
+mbin_interface crc64_iso_refl
+mbin_dispatch_init_clmul crc64_iso_refl, crc64_iso_refl_base, crc64_iso_refl_by8
+mbin_interface crc64_iso_norm
+mbin_dispatch_init_clmul crc64_iso_norm, crc64_iso_norm_base, crc64_iso_norm_by8
+
+mbin_interface crc64_jones_refl
+mbin_dispatch_init_clmul crc64_jones_refl, crc64_jones_refl_base, crc64_jones_refl_by8
+mbin_interface crc64_jones_norm
+mbin_dispatch_init_clmul crc64_jones_norm, crc64_jones_norm_base, crc64_jones_norm_by8
+
+;;; func core, ver, snum
+slversion crc64_ecma_refl, 00, 00, 001b
+slversion crc64_ecma_norm, 00, 00, 0018
+slversion crc64_iso_refl, 00, 00, 0021
+slversion crc64_iso_norm, 00, 00, 001e
+slversion crc64_jones_refl, 00, 00, 0027
+slversion crc64_jones_norm, 00, 00, 0024
diff --git a/src/isa-l/crc/crc_base.c b/src/isa-l/crc/crc_base.c
new file mode 100644
index 00000000..ee14f059
--- /dev/null
+++ b/src/isa-l/crc/crc_base.c
@@ -0,0 +1,170 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include "crc.h"
+
+#define MAX_ITER 8
+
+// Byte-wise lookup table for crc32_iscsi_base(): 256 entries, indexed by
+// (low byte of the running CRC) XOR (next input byte). Entry 0 is zero and
+// entry 128 (0x82F63B78) is the table's generator value for the top index bit.
+uint32_t crc32_table_iscsi_base[256] = {
+ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+ 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+ 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+ 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+ 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+ 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+ 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+ 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+ 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+ 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+ 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+ 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+ 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+ 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+ 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+ 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+ 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+ 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+ 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+ 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+ 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+ 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+ 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+ 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+ 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+ 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+ 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+ 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+ 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+ 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+ 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+ 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+ 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
+};
+
+// iSCSI CRC baseline: table-driven, one input byte per step.
+//   buffer   - first byte of the data
+//   len      - number of bytes to process
+//   crc_init - starting CRC value; it is used as-is and the result is
+//              returned as-is (no inversion happens inside this function)
+// Returns the updated CRC.
+unsigned int crc32_iscsi_base(unsigned char *buffer, int len, unsigned int crc_init)
+{
+    const unsigned char *cursor = buffer;
+    const unsigned char *const limit = buffer + len;
+    unsigned int acc = crc_init;
+
+    for (; cursor < limit; cursor++) {
+        // table index = low byte of the running CRC mixed with the data byte
+        acc = crc32_table_iscsi_base[(acc ^ *cursor) & 0x000000FF] ^ (acc >> 8);
+    }
+
+    return acc;
+}
+
+// crc16_t10dif baseline function
+// Slow crc16 from the definition. Can be sped up with a lookup table.
+//   seed - starting remainder
+//   buf  - data to process
+//   len  - number of bytes in buf
+// Returns the 16-bit remainder after processing len bytes; with len == 0 the
+// seed is returned unchanged.
+uint16_t crc16_t10dif_base(uint16_t seed, uint8_t * buf, uint64_t len)
+{
+    size_t rem = seed;
+    uint64_t i;     // same width as len, so the loop also terminates for len >= 2^32
+    unsigned int j;
+
+    uint16_t poly = 0x8bb7; // t10dif standard
+
+    for (i = 0; i < len; i++) {
+        rem = rem ^ ((size_t)buf[i] << 8);
+        for (j = 0; j < MAX_ITER; j++) {
+            rem = rem << 1;
+            // bit 16 is the bit that was just shifted out of the 16-bit remainder
+            rem = (rem & 0x10000) ? rem ^ poly : rem;
+        }
+    }
+    // bits above 15 are stale shift-out residue; keep only the remainder
+    return (uint16_t)rem;
+}
+
+// crc32_ieee baseline function
+// Slow crc32 from the definition. Can be sped up with a lookup table.
+//   seed - starting CRC; it is inverted on entry and the result is inverted
+//          again on return
+//   buf  - data to process
+//   len  - number of bytes in buf
+// Returns the updated 32-bit CRC; with len == 0 the seed is returned unchanged.
+uint32_t crc32_ieee_base(uint32_t seed, uint8_t * buf, uint64_t len)
+{
+    uint64_t rem = (uint32_t)~seed;
+    uint64_t i;     // same width as len, so the loop also terminates for len >= 2^32
+    unsigned int j;
+
+    uint32_t poly = 0x04C11DB7; // IEEE standard
+
+    for (i = 0; i < len; i++) {
+        // widen before shifting: shifting the promoted (signed) int left by 24
+        // would overflow into the sign bit for bytes >= 0x80
+        rem = rem ^ ((uint64_t)buf[i] << 24);
+        for (j = 0; j < MAX_ITER; j++) {
+            rem = rem << 1;
+            // bit 32 is the bit that was just shifted out of the 32-bit remainder
+            rem = (rem & 0x100000000ULL) ? rem ^ poly : rem;
+        }
+    }
+    // bits above 31 are stale shift-out residue; keep only the remainder
+    return (uint32_t)~rem;
+}
+
+// Version descriptor emitted for each baseline routine. The field names match
+// the "core, ver, snum" triple used by the slversion lines in the assembly
+// sources.
+struct slver {
+ unsigned short snum;
+ unsigned char ver;
+ unsigned char core;
+};
+
+// For every routine there are two objects: an uninitialized one whose symbol
+// name carries an 8-hex-digit suffix, and an initialized one giving
+// { snum, ver, core }.
+// NOTE(review): the suffixes read as core=00, ver=01 while the initializers
+// set ver to 0x02 - confirm which is intended.
+struct slver crc32_iscsi_base_slver_0001011d;
+struct slver crc32_iscsi_base_slver = { 0x011d, 0x02, 0x00 };
+
+struct slver crc16_t10dif_base_slver_0001011e;
+struct slver crc16_t10dif_base_slver = { 0x011e, 0x02, 0x00 };
+
+struct slver crc32_ieee_base_slver_0001011f;
+struct slver crc32_ieee_base_slver = { 0x011f, 0x02, 0x00 };
diff --git a/src/isa-l/crc/crc_base_aliases.c b/src/isa-l/crc/crc_base_aliases.c
new file mode 100644
index 00000000..63dfb306
--- /dev/null
+++ b/src/isa-l/crc/crc_base_aliases.c
@@ -0,0 +1,77 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "crc.h"
+#include "crc64.h"
+#include <stdint.h>
+
+// C definition of crc32_iscsi: forwards its arguments unchanged to
+// crc32_iscsi_base and returns that result.
+unsigned int crc32_iscsi(unsigned char *buffer, int len, unsigned int crc_init)
+{
+ return crc32_iscsi_base(buffer, len, crc_init);
+}
+
+// C definition of crc16_t10dif: forwards to crc16_t10dif_base and returns
+// its result. The cast drops const because the base routine is called with a
+// plain uint8_t pointer.
+uint16_t crc16_t10dif(uint16_t seed, const unsigned char *buf, uint64_t len)
+{
+ return crc16_t10dif_base(seed, (uint8_t *) buf, len);
+}
+
+// C definition of crc32_ieee: forwards to crc32_ieee_base and returns its
+// result. The cast drops const because the base routine is called with a
+// plain uint8_t pointer.
+uint32_t crc32_ieee(uint32_t seed, const unsigned char *buf, uint64_t len)
+{
+ return crc32_ieee_base(seed, (uint8_t *) buf, len);
+}
+
+// C definition of crc64_ecma_refl: forwards its arguments unchanged to
+// crc64_ecma_refl_base and returns that result.
+uint64_t crc64_ecma_refl(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_ecma_refl_base(seed, buf, len);
+}
+
+// C definition of crc64_ecma_norm: forwards its arguments unchanged to
+// crc64_ecma_norm_base and returns that result.
+uint64_t crc64_ecma_norm(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_ecma_norm_base(seed, buf, len);
+}
+
+// C definition of crc64_iso_refl: forwards its arguments unchanged to
+// crc64_iso_refl_base and returns that result.
+uint64_t crc64_iso_refl(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_iso_refl_base(seed, buf, len);
+}
+
+// C definition of crc64_iso_norm: forwards its arguments unchanged to
+// crc64_iso_norm_base and returns that result.
+uint64_t crc64_iso_norm(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_iso_norm_base(seed, buf, len);
+}
+
+// C definition of crc64_jones_refl: forwards its arguments unchanged to
+// crc64_jones_refl_base and returns that result.
+uint64_t crc64_jones_refl(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_jones_refl_base(seed, buf, len);
+}
+
+// C definition of crc64_jones_norm: forwards its arguments unchanged to
+// crc64_jones_norm_base and returns that result.
+uint64_t crc64_jones_norm(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_jones_norm_base(seed, buf, len);
+}
diff --git a/src/isa-l/crc/crc_multibinary.asm b/src/isa-l/crc/crc_multibinary.asm
new file mode 100644
index 00000000..ae0c0498
--- /dev/null
+++ b/src/isa-l/crc/crc_multibinary.asm
@@ -0,0 +1,180 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+; WRT_OPT is appended to every external function reference in the
+; *_dispatch_init routines below. It expands to NASM's "wrt ..plt" qualifier
+; for elf64 output and to nothing for every other format.
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern crc32_iscsi_00
+extern crc32_iscsi_01
+extern crc32_iscsi_base
+
+extern crc32_ieee_01
+extern crc32_ieee_by4 ;; Optimized for SLM
+extern crc32_ieee_base
+
+extern crc16_t10dif_01
+extern crc16_t10dif_by4 ;; Optimized for SLM
+extern crc16_t10dif_base
+
+section .data
+;;; One function pointer per public entry point. Each *_dispatched slot
+;;; initially holds the address of its *_mbinit stub and is overwritten with
+;;; the selected implementation on the first call.
+;;; Therefore, *_dispatch_init is only executed on first call.
+
+crc32_iscsi_dispatched:
+ dq crc32_iscsi_mbinit
+
+crc32_ieee_dispatched:
+ dq crc32_ieee_mbinit
+
+crc16_t10dif_dispatched:
+ dq crc16_t10dif_mbinit
+
+section .text
+;;;;
+; crc32_iscsi multibinary function
+;
+; crc32_iscsi jumps through crc32_iscsi_dispatched. On the first call that
+; slot points at crc32_iscsi_mbinit, which runs crc32_iscsi_dispatch_init and
+; then falls through into crc32_iscsi again, now jumping to the chosen routine.
+;
+; Selection (CPUID leaf 1, ECX): default crc32_iscsi_base; crc32_iscsi_00 if
+; the SSE4.2 flag is set; crc32_iscsi_01 if the CLMUL flag is set (tested
+; last, so it overrides the others).
+; All registers touched by the init routine are saved and restored, so the
+; caller's argument registers reach the selected routine intact.
+;;;;
+global crc32_iscsi:function
+crc32_iscsi_mbinit:
+ call crc32_iscsi_dispatch_init
+crc32_iscsi:
+ jmp qword [crc32_iscsi_dispatched]
+
+crc32_iscsi_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [crc32_iscsi_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ lea rbx, [crc32_iscsi_00 WRT_OPT]
+ lea rax, [crc32_iscsi_01 WRT_OPT]
+
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ cmovne rsi, rbx
+ test ecx, FLAG_CPUID1_ECX_CLMUL
+ cmovne rsi, rax
+ mov [crc32_iscsi_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+;;;;
+; crc32_ieee multibinary function
+;
+; crc32_ieee jumps through crc32_ieee_dispatched. On the first call that slot
+; points at crc32_ieee_mbinit, which runs crc32_ieee_dispatch_init and then
+; falls through into crc32_ieee again, now jumping to the chosen routine.
+;
+; Selection (CPUID leaf 1): if the SSE3 flag in ECX is clear, keep
+; crc32_ieee_base. Otherwise take crc32_ieee_01 when the CLMUL flag is set,
+; and finally override with crc32_ieee_by4 when EAX masked with
+; FLAG_CPUID1_EAX_STEP_MASK equals FLAG_CPUID1_EAX_AVOTON.
+; EAX must survive from cpuid to the "and", which is why the candidate
+; pointers are held in rbx and rdx rather than rax.
+;;;;
+global crc32_ieee:function
+crc32_ieee_mbinit:
+ call crc32_ieee_dispatch_init
+crc32_ieee:
+ jmp qword [crc32_ieee_dispatched]
+
+crc32_ieee_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [crc32_ieee_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ lea rbx, [crc32_ieee_01 WRT_OPT]
+ lea rdx, [crc32_ieee_by4 WRT_OPT]
+
+ test ecx, FLAG_CPUID1_ECX_SSE3
+ jz use_ieee_base
+ test ecx, FLAG_CPUID1_ECX_CLMUL
+ cmovne rsi, rbx
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ cmove rsi, rdx
+use_ieee_base:
+ mov [crc32_ieee_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+;;;;
+; crc16_t10dif multibinary function
+;
+; crc16_t10dif jumps through crc16_t10dif_dispatched. On the first call that
+; slot points at crc16_t10dif_mbinit, which runs crc16_t10dif_dispatch_init
+; and then falls through into crc16_t10dif again, now jumping to the chosen
+; routine.
+;
+; Selection (CPUID leaf 1): if the SSE3 flag in ECX is clear, keep
+; crc16_t10dif_base. Otherwise take crc16_t10dif_01 when the CLMUL flag is
+; set, and finally override with crc16_t10dif_by4 when EAX masked with
+; FLAG_CPUID1_EAX_STEP_MASK equals FLAG_CPUID1_EAX_AVOTON.
+; EAX must survive from cpuid to the "and", which is why the candidate
+; pointers are held in rbx and rdx rather than rax.
+;;;;
+global crc16_t10dif:function
+crc16_t10dif_mbinit:
+ call crc16_t10dif_dispatch_init
+crc16_t10dif:
+ jmp qword [crc16_t10dif_dispatched]
+
+crc16_t10dif_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [crc16_t10dif_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ lea rbx, [crc16_t10dif_01 WRT_OPT]
+ lea rdx, [crc16_t10dif_by4 WRT_OPT]
+
+ test ecx, FLAG_CPUID1_ECX_SSE3
+ jz use_t10dif_base
+ test ecx, FLAG_CPUID1_ECX_CLMUL
+ cmovne rsi, rbx
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ cmove rsi, rdx
+use_t10dif_base:
+ mov [crc16_t10dif_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+;;; func core, ver, snum
+slversion crc16_t10dif, 00, 03, 011a
+slversion crc32_ieee, 00, 03, 011b
+slversion crc32_iscsi, 00, 03, 011c
diff --git a/src/isa-l/crc/crc_simple_test.c b/src/isa-l/crc/crc_simple_test.c
new file mode 100644
index 00000000..cac18f52
--- /dev/null
+++ b/src/isa-l/crc/crc_simple_test.c
@@ -0,0 +1,63 @@
+/**********************************************************************
+ Copyright(c) 2011-2013 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdint.h>
+#include "crc.h"
+
+const uint16_t init_crc_16 = 0x1234; /* first argument main() passes to crc16_t10dif() */
+const uint16_t t10_dif_expected = 0x60b3; /* reference value main() compares the crc16_t10dif() result against */
+const uint32_t init_crc_32 = 0x12345678; /* first argument main() passes to crc32_ieee() */
+const uint32_t ieee_expected = 0x2ceadbe3; /* reference value main() compares the crc32_ieee() result against */
+
+int main(void) /* Runs crc16_t10dif() and crc32_ieee() over one fixed 48-byte buffer and prints whether each result matches its reference constant. */
+{
+ unsigned char p_buf[48]; /* input buffer shared by both CRC calls */
+ uint16_t t10_dif_computed;
+ uint32_t ieee_computed;
+ int i;
+
+ for (i = 0; i < 48; i++)
+ p_buf[i] = i; /* fill the buffer with the byte values 0, 1, ..., 47 */
+
+ t10_dif_computed = crc16_t10dif(init_crc_16, p_buf, 48);
+
+ if (t10_dif_computed != t10_dif_expected)
+ printf("WRONG CRC-16(T10 DIF) value\n");
+ else
+ printf("CORRECT CRC-16(T10 DIF) value\n");
+
+ ieee_computed = crc32_ieee(init_crc_32, p_buf, 48); /* same buffer, 32-bit seed */
+
+ if (ieee_computed != ieee_expected)
+ printf("WRONG CRC-32(IEEE) value\n");
+ else
+ printf("CORRECT CRC-32(IEEE) value\n");
+
+ return 0; /* NOTE(review): exit status is 0 even when a WRONG line was printed; a mismatch is only visible on stdout */
+}