path: root/src/isa-l/crc
Diffstat (limited to 'src/isa-l/crc')
-rw-r--r--  src/isa-l/crc/Makefile.am | 89
-rw-r--r--  src/isa-l/crc/aarch64/Makefile.am | 57
-rw-r--r--  src/isa-l/crc/aarch64/crc16_t10dif_copy_pmull.S | 423
-rw-r--r--  src/isa-l/crc/aarch64/crc16_t10dif_pmull.S | 404
-rw-r--r--  src/isa-l/crc/aarch64/crc32_aarch64_common.h | 321
-rw-r--r--  src/isa-l/crc/aarch64/crc32_common_crc_ext_cortex_a72.S | 135
-rw-r--r--  src/isa-l/crc/aarch64/crc32_common_mix_neoverse_n1.S | 432
-rw-r--r--  src/isa-l/crc/aarch64/crc32_gzip_refl_3crc_fold.S | 95
-rw-r--r--  src/isa-l/crc/aarch64/crc32_gzip_refl_crc_ext.S | 66
-rw-r--r--  src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.h | 87
-rw-r--r--  src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.h | 87
-rw-r--r--  src/isa-l/crc/aarch64/crc32_iscsi_3crc_fold.S | 97
-rw-r--r--  src/isa-l/crc/aarch64/crc32_iscsi_crc_ext.S | 65
-rw-r--r--  src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.S | 53
-rw-r--r--  src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.h | 87
-rw-r--r--  src/isa-l/crc/aarch64/crc32_mix_default.S | 107
-rw-r--r--  src/isa-l/crc/aarch64/crc32_mix_default_common.S | 563
-rw-r--r--  src/isa-l/crc/aarch64/crc32_mix_neoverse_n1.S | 70
-rw-r--r--  src/isa-l/crc/aarch64/crc32_norm_common_pmull.h | 135
-rw-r--r--  src/isa-l/crc/aarch64/crc32_refl_common_pmull.h | 126
-rw-r--r--  src/isa-l/crc/aarch64/crc32c_mix_default.S | 109
-rw-r--r--  src/isa-l/crc/aarch64/crc32c_mix_neoverse_n1.S | 68
-rw-r--r--  src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.h | 200
-rw-r--r--  src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.h | 196
-rw-r--r--  src/isa-l/crc/aarch64/crc64_iso_norm_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc64_iso_norm_pmull.h | 201
-rw-r--r--  src/isa-l/crc/aarch64/crc64_iso_refl_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc64_iso_refl_pmull.h | 197
-rw-r--r--  src/isa-l/crc/aarch64/crc64_jones_norm_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc64_jones_norm_pmull.h | 200
-rw-r--r--  src/isa-l/crc/aarch64/crc64_jones_refl_pmull.S | 33
-rw-r--r--  src/isa-l/crc/aarch64/crc64_jones_refl_pmull.h | 196
-rw-r--r--  src/isa-l/crc/aarch64/crc64_norm_common_pmull.h | 129
-rw-r--r--  src/isa-l/crc/aarch64/crc64_refl_common_pmull.h | 126
-rw-r--r--  src/isa-l/crc/aarch64/crc_aarch64_dispatcher.c | 166
-rw-r--r--  src/isa-l/crc/aarch64/crc_common_pmull.h | 302
-rw-r--r--  src/isa-l/crc/aarch64/crc_multibinary_arm.S | 42
-rw-r--r--  src/isa-l/crc/crc16_t10dif_01.asm | 666
-rw-r--r--  src/isa-l/crc/crc16_t10dif_02.asm | 654
-rw-r--r--  src/isa-l/crc/crc16_t10dif_by16_10.asm | 591
-rw-r--r--  src/isa-l/crc/crc16_t10dif_by4.asm | 563
-rw-r--r--  src/isa-l/crc/crc16_t10dif_copy_by4.asm | 599
-rw-r--r--  src/isa-l/crc/crc16_t10dif_copy_by4_02.asm | 596
-rw-r--r--  src/isa-l/crc/crc16_t10dif_copy_perf.c | 84
-rw-r--r--  src/isa-l/crc/crc16_t10dif_copy_test.c | 175
-rw-r--r--  src/isa-l/crc/crc16_t10dif_op_perf.c | 116
-rw-r--r--  src/isa-l/crc/crc16_t10dif_perf.c | 79
-rw-r--r--  src/isa-l/crc/crc16_t10dif_test.c | 179
-rw-r--r--  src/isa-l/crc/crc32_funcs_test.c | 324
-rw-r--r--  src/isa-l/crc/crc32_gzip_refl_by16_10.asm | 569
-rw-r--r--  src/isa-l/crc/crc32_gzip_refl_by8.asm | 625
-rw-r--r--  src/isa-l/crc/crc32_gzip_refl_by8_02.asm | 556
-rw-r--r--  src/isa-l/crc/crc32_gzip_refl_perf.c | 91
-rw-r--r--  src/isa-l/crc/crc32_ieee_01.asm | 656
-rw-r--r--  src/isa-l/crc/crc32_ieee_02.asm | 652
-rw-r--r--  src/isa-l/crc/crc32_ieee_by16_10.asm | 585
-rw-r--r--  src/isa-l/crc/crc32_ieee_by4.asm | 566
-rw-r--r--  src/isa-l/crc/crc32_ieee_perf.c | 79
-rw-r--r--  src/isa-l/crc/crc32_iscsi_00.asm | 672
-rw-r--r--  src/isa-l/crc/crc32_iscsi_01.asm | 592
-rw-r--r--  src/isa-l/crc/crc32_iscsi_by16_10.asm | 556
-rw-r--r--  src/isa-l/crc/crc32_iscsi_perf.c | 79
-rw-r--r--  src/isa-l/crc/crc64_base.c | 912
-rw-r--r--  src/isa-l/crc/crc64_ecma_norm_by16_10.asm | 61
-rw-r--r--  src/isa-l/crc/crc64_ecma_norm_by8.asm | 584
-rw-r--r--  src/isa-l/crc/crc64_ecma_refl_by16_10.asm | 61
-rw-r--r--  src/isa-l/crc/crc64_ecma_refl_by8.asm | 549
-rw-r--r--  src/isa-l/crc/crc64_example.c | 68
-rw-r--r--  src/isa-l/crc/crc64_funcs_perf.c | 103
-rw-r--r--  src/isa-l/crc/crc64_funcs_test.c | 315
-rw-r--r--  src/isa-l/crc/crc64_iso_norm_by16_10.asm | 525
-rw-r--r--  src/isa-l/crc/crc64_iso_norm_by8.asm | 582
-rw-r--r--  src/isa-l/crc/crc64_iso_refl_by16_10.asm | 495
-rw-r--r--  src/isa-l/crc/crc64_iso_refl_by8.asm | 545
-rw-r--r--  src/isa-l/crc/crc64_jones_norm_by16_10.asm | 61
-rw-r--r--  src/isa-l/crc/crc64_jones_norm_by8.asm | 582
-rw-r--r--  src/isa-l/crc/crc64_jones_refl_by16_10.asm | 61
-rw-r--r--  src/isa-l/crc/crc64_jones_refl_by8.asm | 545
-rw-r--r--  src/isa-l/crc/crc64_multibinary.asm | 92
-rw-r--r--  src/isa-l/crc/crc64_ref.h | 148
-rw-r--r--  src/isa-l/crc/crc_base.c | 351
-rw-r--r--  src/isa-l/crc/crc_base_aliases.c | 87
-rw-r--r--  src/isa-l/crc/crc_multibinary.asm | 328
-rw-r--r--  src/isa-l/crc/crc_ref.h | 140
-rw-r--r--  src/isa-l/crc/crc_simple_test.c | 64
89 files changed, 24058 insertions(+), 0 deletions(-)
diff --git a/src/isa-l/crc/Makefile.am b/src/isa-l/crc/Makefile.am
new file mode 100644
index 000000000..f12441c8d
--- /dev/null
+++ b/src/isa-l/crc/Makefile.am
@@ -0,0 +1,89 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+include crc/aarch64/Makefile.am
+
+lsrc += \
+ crc/crc_base.c \
+ crc/crc64_base.c
+
+lsrc_base_aliases += crc/crc_base_aliases.c
+lsrc_x86_32 += crc/crc_base_aliases.c
+lsrc_ppc64le += crc/crc_base_aliases.c
+
+lsrc_x86_64 += \
+ crc/crc16_t10dif_01.asm \
+ crc/crc16_t10dif_by4.asm \
+ crc/crc16_t10dif_02.asm \
+ crc/crc16_t10dif_by16_10.asm \
+ crc/crc16_t10dif_copy_by4.asm \
+ crc/crc16_t10dif_copy_by4_02.asm \
+ crc/crc32_ieee_01.asm \
+ crc/crc32_ieee_02.asm \
+ crc/crc32_ieee_by4.asm \
+ crc/crc32_ieee_by16_10.asm \
+ crc/crc32_iscsi_01.asm \
+ crc/crc32_iscsi_00.asm \
+ crc/crc32_iscsi_by16_10.asm \
+ crc/crc_multibinary.asm \
+ crc/crc64_multibinary.asm \
+ crc/crc64_ecma_refl_by8.asm \
+ crc/crc64_ecma_refl_by16_10.asm \
+ crc/crc64_ecma_norm_by8.asm \
+ crc/crc64_ecma_norm_by16_10.asm \
+ crc/crc64_iso_refl_by8.asm \
+ crc/crc64_iso_refl_by16_10.asm \
+ crc/crc64_iso_norm_by8.asm \
+ crc/crc64_iso_norm_by16_10.asm \
+ crc/crc64_jones_refl_by8.asm \
+ crc/crc64_jones_refl_by16_10.asm \
+ crc/crc64_jones_norm_by8.asm \
+ crc/crc64_jones_norm_by16_10.asm \
+ crc/crc32_gzip_refl_by8.asm \
+ crc/crc32_gzip_refl_by8_02.asm \
+ crc/crc32_gzip_refl_by16_10.asm
+
+src_include += -I $(srcdir)/crc
+extern_hdrs += include/crc.h include/crc64.h
+
+other_src += include/reg_sizes.asm include/types.h include/test.h \
+ crc/crc_ref.h crc/crc64_ref.h
+
+check_tests += crc/crc16_t10dif_test \
+ crc/crc16_t10dif_copy_test \
+ crc/crc64_funcs_test \
+ crc/crc32_funcs_test
+
+perf_tests += crc/crc16_t10dif_perf crc/crc16_t10dif_copy_perf \
+ crc/crc16_t10dif_op_perf \
+ crc/crc32_ieee_perf crc/crc32_iscsi_perf \
+ crc/crc64_funcs_perf crc/crc32_gzip_refl_perf
+
+examples += crc/crc_simple_test crc/crc64_example
+
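
For orientation, the sources listed above build the generic CRC entry points exported through include/crc.h and include/crc64.h, with the multibinary layer choosing the fastest implementation at runtime. A minimal usage sketch, not part of this change; the prototypes, include paths, and seed conventions are assumed to match those headers:

#include <stdint.h>
#include <stdio.h>
#include "crc.h"	/* crc16_t10dif(), crc32_gzip_refl(), ... (assumed from include/crc.h) */
#include "crc64.h"	/* crc64_ecma_refl(), ... (assumed from include/crc64.h) */

int main(void)
{
	unsigned char buf[4096];

	for (unsigned i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char)i;

	/* A seed of 0 is assumed to give the standard T10-DIF / gzip / ECMA results;
	 * the multibinary stubs dispatch to the best implementation for this CPU. */
	uint16_t c16 = crc16_t10dif(0, buf, sizeof(buf));
	uint32_t c32 = crc32_gzip_refl(0, buf, sizeof(buf));
	uint64_t c64 = crc64_ecma_refl(0, buf, sizeof(buf));

	printf("t10dif=%04x gzip=%08x ecma=%016llx\n",
	       c16, c32, (unsigned long long)c64);
	return 0;
}
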
diff --git a/src/isa-l/crc/aarch64/Makefile.am b/src/isa-l/crc/aarch64/Makefile.am
new file mode 100644
index 000000000..5113b77e9
--- /dev/null
+++ b/src/isa-l/crc/aarch64/Makefile.am
@@ -0,0 +1,57 @@
+########################################################################
+# Copyright(c) 2020 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+lsrc_aarch64 += \
+ crc/aarch64/crc_multibinary_arm.S \
+ crc/aarch64/crc_aarch64_dispatcher.c
+
+lsrc_aarch64 += \
+ crc/aarch64/crc16_t10dif_pmull.S \
+ crc/aarch64/crc16_t10dif_copy_pmull.S \
+ crc/aarch64/crc32_ieee_norm_pmull.S \
+ crc/aarch64/crc64_ecma_refl_pmull.S \
+ crc/aarch64/crc64_ecma_norm_pmull.S \
+ crc/aarch64/crc64_iso_refl_pmull.S \
+ crc/aarch64/crc64_iso_norm_pmull.S \
+ crc/aarch64/crc64_jones_refl_pmull.S \
+ crc/aarch64/crc64_jones_norm_pmull.S
+
+# CRC32/CRC32C implementations tuned per micro-architecture
+lsrc_aarch64 += \
+ crc/aarch64/crc32_iscsi_refl_pmull.S \
+ crc/aarch64/crc32_gzip_refl_pmull.S \
+ crc/aarch64/crc32_iscsi_3crc_fold.S \
+ crc/aarch64/crc32_gzip_refl_3crc_fold.S \
+ crc/aarch64/crc32_iscsi_crc_ext.S \
+ crc/aarch64/crc32_gzip_refl_crc_ext.S \
+ crc/aarch64/crc32_mix_default.S \
+ crc/aarch64/crc32c_mix_default.S \
+ crc/aarch64/crc32_mix_neoverse_n1.S \
+ crc/aarch64/crc32c_mix_neoverse_n1.S
+
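
crc_multibinary_arm.S and crc_aarch64_dispatcher.c listed above provide the runtime selection between these variants. A hedged sketch of the usual HWCAP-based selection on aarch64 Linux; the selector name and the extern symbols below are illustrative, not the actual dispatcher code:

#include <sys/auxv.h>	/* getauxval(AT_HWCAP) */
#include <asm/hwcap.h>	/* HWCAP_CRC32, HWCAP_PMULL */

typedef unsigned int (*crc32_iscsi_fn)(unsigned char *buf, int len, unsigned int init_crc);

/* Implementations assumed to exist elsewhere in this directory (illustrative externs). */
extern unsigned int crc32_iscsi_crc_ext(unsigned char *, int, unsigned int);
extern unsigned int crc32_iscsi_refl_pmull(unsigned char *, int, unsigned int);
extern unsigned int crc32_iscsi_base(unsigned char *, int, unsigned int);

/* Pick the best implementation the CPU advertises; a multibinary stub would
 * typically cache the result in a function pointer on first call. */
static crc32_iscsi_fn select_crc32_iscsi(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);

	if (hwcap & HWCAP_CRC32)
		return crc32_iscsi_crc_ext;	/* ARMv8 CRC32 instructions */
	if (hwcap & HWCAP_PMULL)
		return crc32_iscsi_refl_pmull;	/* carry-less multiply folding */
	return crc32_iscsi_base;		/* portable table-driven fallback */
}
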
diff --git a/src/isa-l/crc/aarch64/crc16_t10dif_copy_pmull.S b/src/isa-l/crc/aarch64/crc16_t10dif_copy_pmull.S
new file mode 100644
index 000000000..10bf157c2
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc16_t10dif_copy_pmull.S
@@ -0,0 +1,423 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+ .arch armv8-a+crc+crypto
+ .text
+ .align 3
+ .global crc16_t10dif_copy_pmull
+ .type crc16_t10dif_copy_pmull, %function
+
+/* uint16_t crc16_t10dif_copy_pmull(uint16_t seed, uint8_t *dst, uint8_t *src, uint64_t len) */
+
+/* arguments */
+w_seed .req w0
+x_dst .req x1
+x_src .req x2
+x_len .req x3
+w_len .req w3
+
+/* returns */
+w_ret .req w0
+
+/* these are used as global temporary registers */
+w_tmp .req w6
+x_tmp .req x6
+x_tmp1 .req x7
+x_tmp2 .req x11
+
+d_tmp1 .req d0
+d_tmp2 .req d1
+q_tmp1 .req q0
+q_tmp2 .req q1
+v_tmp1 .req v0
+v_tmp2 .req v1
+
+/* local variables */
+w_counter .req w4
+w_crc .req w0
+x_crc .req x0
+x_counter .req x4
+x_crc16tab .req x5
+x_src_saved .req x0
+x_dst_saved .req x12
+
+crc16_t10dif_copy_pmull:
+ cmp x_len, 1023
+ sub sp, sp, #16
+ uxth w_seed, w_seed
+ bhi .crc_fold
+
+ mov x_tmp, 0
+ mov w_counter, 0
+
+.crc_table_loop_pre:
+ cmp x_len, x_tmp
+ bls .end
+
+ sxtw x_counter, w_counter
+ adrp x_crc16tab, .LANCHOR0
+ sub x_src, x_src, x_counter
+ sub x_dst, x_dst, x_counter
+ add x_crc16tab, x_crc16tab, :lo12:.LANCHOR0
+
+ .align 2
+.crc_table_loop:
+ ldrb w_tmp, [x_src, x_counter]
+ strb w_tmp, [x_dst, x_counter]
+ add x_counter, x_counter, 1
+ cmp x_len, x_counter
+ eor w_tmp, w_tmp, w_crc, lsr 8
+ ldrh w_tmp, [x_crc16tab, w_tmp, sxtw 1]
+ eor w_crc, w_tmp, w_crc, lsl 8
+ uxth w_crc, w_crc
+ bhi .crc_table_loop
+
+.end:
+ add sp, sp, 16
+ ret
+
+/* carry less multiplication, part1 - before loop */
+q_x0 .req q2
+q_x1 .req q3
+q_x2 .req q4
+q_x3 .req q5
+
+v_x0 .req v2
+v_x1 .req v3
+v_x2 .req v4
+v_x3 .req v5
+
+d_x0 .req d2
+d_x1 .req d3
+d_x2 .req d4
+d_x3 .req d5
+
+// the following registers are only used in this part (part1)
+d_tmp3 .req d16
+v_tmp3 .req v16
+
+ .align 3
+.crc_fold:
+ fmov d_tmp1, x_crc
+ fmov d_tmp2, xzr
+ dup d_tmp3, v_tmp2.d[0]
+ shl d_tmp1, d_tmp1, 48
+ ins v_tmp3.d[1], v_tmp1.d[0]
+
+ and x_counter, x_len, -64
+ sub x_counter, x_counter, #64
+ cmp x_counter, 63
+ add x_src_saved, x_src, 64
+ add x_dst_saved, x_dst, 64
+
+ ldr q_x0, [x_src]
+ ldr q_x1, [x_src, 16]
+ ldr q_x2, [x_src, 32]
+ ldr q_x3, [x_src, 48]
+
+ str q_x0, [x_dst]
+ str q_x1, [x_dst, 16]
+ str q_x2, [x_dst, 32]
+ str q_x3, [x_dst, 48]
+
+ adrp x_tmp, .shuffle_mask_lanchor
+ ldr q7, [x_tmp, :lo12:.shuffle_mask_lanchor]
+
+ tbl v_tmp1.16b, {v_x0.16b}, v7.16b
+ eor v_x0.16b, v_tmp3.16b, v_tmp1.16b
+
+ tbl v_x1.16b, {v_x1.16b}, v7.16b
+ tbl v_x2.16b, {v_x2.16b}, v7.16b
+ tbl v_x3.16b, {v_x3.16b}, v7.16b
+ bls .crc_fold_loop_end
+
+/* carry less multiplication, part2 - loop */
+q_y0 .req q28
+q_y1 .req q29
+q_y2 .req q30
+q_y3 .req q31
+
+v_y0 .req v28
+v_y1 .req v29
+v_y2 .req v30
+v_y3 .req v31
+
+d_x0_h .req d24
+d_x0_l .req d2
+d_x1_h .req d25
+d_x1_l .req d3
+d_x2_h .req d26
+d_x2_l .req d4
+d_x3_h .req d27
+d_x3_l .req d5
+
+v_x0_h .req v24
+v_x0_l .req v2
+v_x1_h .req v25
+v_x1_l .req v3
+v_x2_h .req v26
+v_x2_l .req v4
+v_x3_h .req v27
+v_x3_l .req v5
+
+v_tmp1_x0 .req v24
+v_tmp1_x1 .req v25
+v_tmp1_x2 .req v26
+v_tmp1_x3 .req v27
+
+d_p4_h .req d19
+v_p4_h .req v19
+d_p4_l .req d17
+v_p4_l .req v17
+
+ mov x_tmp, 0x371d0000 /* p4 [1] */
+ fmov d_p4_h, x_tmp
+ mov x_tmp, 0x87e70000 /* p4 [0] */
+ fmov d_p4_l, x_tmp
+
+ .align 2
+.crc_fold_loop:
+ add x_src_saved, x_src_saved, 64
+ add x_dst_saved, x_dst_saved, 64
+
+ sub x_counter, x_counter, #64
+ cmp x_counter, 63
+
+ dup d_x0_h, v_x0.d[1]
+ dup d_x1_h, v_x1.d[1]
+ dup d_x2_h, v_x2.d[1]
+ dup d_x3_h, v_x3.d[1]
+
+ dup d_x0_l, v_x0.d[0]
+ dup d_x1_l, v_x1.d[0]
+ dup d_x2_l, v_x2.d[0]
+ dup d_x3_l, v_x3.d[0]
+
+ ldr q_y0, [x_src_saved, -64]
+ ldr q_y1, [x_src_saved, -48]
+ ldr q_y2, [x_src_saved, -32]
+ ldr q_y3, [x_src_saved, -16]
+
+ str q_y0, [x_dst_saved, -64]
+ str q_y1, [x_dst_saved, -48]
+ str q_y2, [x_dst_saved, -32]
+ str q_y3, [x_dst_saved, -16]
+
+ pmull v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
+ pmull v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
+ pmull v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
+ pmull v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
+ pmull v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
+ pmull v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
+ pmull v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
+ pmull v_x3_l.1q, v_x3_l.1d, v_p4_l.1d
+
+ tbl v_y0.16b, {v_y0.16b}, v7.16b
+ tbl v_y1.16b, {v_y1.16b}, v7.16b
+ tbl v_y2.16b, {v_y2.16b}, v7.16b
+ tbl v_y3.16b, {v_y3.16b}, v7.16b
+
+ eor v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
+ eor v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
+ eor v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
+ eor v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b
+
+ eor v_x0.16b, v_tmp1_x0.16b, v_y0.16b
+ eor v_x1.16b, v_tmp1_x1.16b, v_y1.16b
+ eor v_x2.16b, v_tmp1_x2.16b, v_y2.16b
+ eor v_x3.16b, v_tmp1_x3.16b, v_y3.16b
+
+ bhi .crc_fold_loop
+
+/* carry less multiplication, part3 - after loop */
+/* folding 512bit ---> 128bit */
+
+// input parameters:
+// v_x0 => v2
+// v_x1 => v3
+// v_x2 => v4
+// v_x3 => v5
+
+// v0, v1, v6, v30 are tmp registers
+
+.crc_fold_loop_end:
+ mov x_tmp, 0x4c1a0000 /* p1 [1] */
+ fmov d0, x_tmp
+ mov x_tmp, 0xfb0b0000 /* p1 [0] */
+ fmov d1, x_tmp
+
+ and w_counter, w_len, -64
+ sxtw x_tmp, w_counter
+
+ add x_src, x_src, x_tmp
+ add x_dst, x_dst, x_tmp
+
+ dup d6, v_x0.d[1]
+ dup d30, v_x0.d[0]
+ pmull v6.1q, v6.1d, v0.1d
+ pmull v30.1q, v30.1d, v1.1d
+ eor v6.16b, v6.16b, v30.16b
+ eor v_x1.16b, v6.16b, v_x1.16b
+
+ dup d6, v_x1.d[1]
+ dup d30, v_x1.d[0]
+ pmull v6.1q, v6.1d, v0.1d
+ pmull v16.1q, v30.1d, v1.1d
+ eor v6.16b, v6.16b, v16.16b
+ eor v_x2.16b, v6.16b, v_x2.16b
+
+ dup d_x0, v_x2.d[1]
+ dup d30, v_x2.d[0]
+ pmull v0.1q, v_x0.1d, v0.1d
+ pmull v_x0.1q, v30.1d, v1.1d
+ eor v1.16b, v0.16b, v_x0.16b
+ eor v_x0.16b, v1.16b, v_x3.16b
+
+/* carry less multiplication, part3 - after loop */
+/* crc16 fold function */
+d_16fold_p0_h .req d18
+v_16fold_p0_h .req v18
+
+d_16fold_p0_l .req d4
+v_16fold_p0_l .req v4
+
+v_16fold_from .req v_x0
+d_16fold_from_h .req d3
+v_16fold_from_h .req v3
+
+v_16fold_zero .req v7
+
+v_16fold_from1 .req v16
+
+v_16fold_from2 .req v0
+d_16fold_from2_h .req d6
+v_16fold_from2_h .req v6
+
+v_16fold_tmp .req v0
+
+ movi v_16fold_zero.4s, 0
+ mov x_tmp1, 0x2d560000 /* p0 [1] */
+ mov x_tmp2, 0x13680000 /* p0 [0] */
+
+ ext v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
+ ext v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4
+
+ dup d_16fold_from_h, v_16fold_from.d[1]
+ fmov d_16fold_p0_h, x_tmp1
+ pmull v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
+ eor v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b
+
+ dup d_16fold_from2_h, v_16fold_from2.d[1]
+ fmov d_16fold_p0_l, x_tmp2
+ pmull v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
+ eor v_x0.16b, v0.16b, v6.16b
+
+/* carry less multiplication, part3 - after loop */
+/* crc16 barrett reduction function */
+
+// input parameters:
+// v_x0: v2
+// barrett reduction constant: br[0], br[1]
+
+d_br0 .req d3
+v_br0 .req v3
+d_br1 .req d5
+v_br1 .req v5
+
+ mov x_tmp1, 0x57f9 /* br[0] low */
+ movk x_tmp1, 0xf65a, lsl 16 /* br[0] high */
+ movk x_tmp1, 0x1, lsl 32
+ fmov d_br0, x_tmp1
+
+ dup d1, v_x0.d[0]
+ dup d1, v1.d[0]
+ ext v1.16b, v1.16b, v7.16b, #4
+ pmull v4.1q, v1.1d, v_br0.1d
+
+ ext v1.16b, v4.16b, v7.16b, #4
+ mov x_tmp1, 0x8bb70000 /* br[1] low */
+ movk x_tmp1, 0x1, lsl 32 /* br[1] high */
+
+ fmov d_br1, x_tmp1
+ pmull v_br1.1q, v1.1d, v_br1.1d
+ eor v_x0.16b, v_x0.16b, v_br1.16b
+
+ umov x0, v_x0.d[0]
+ ubfx x0, x0, 16, 16
+ b .crc_table_loop_pre
+
+ .size crc16_t10dif_copy_pmull, .-crc16_t10dif_copy_pmull
+
+ .section .rodata
+
+ .align 4
+.shuffle_mask_lanchor = . + 0
+ .type shuffle_mask, %object
+ .size shuffle_mask, 16
+shuffle_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
+
+ .align 4
+.LANCHOR0 = . + 0
+ .type crc16tab, %object
+ .size crc16tab, 512
+crc16tab:
+ .hword 0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
+ .hword 0xEFBD, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
+ .hword 0x54CD, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
+ .hword 0xBB70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
+ .hword 0xA99A, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
+ .hword 0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
+ .hword 0xFD57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
+ .hword 0x12EA, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
+ .hword 0xD883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
+ .hword 0x373E, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
+ .hword 0x8C4E, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
+ .hword 0x63F3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
+ .hword 0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
+ .hword 0x9EA4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
+ .hword 0x25D4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
+ .hword 0xCA69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
+ .hword 0x3AB1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
+ .hword 0xD50C, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
+ .hword 0x6E7C, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
+ .hword 0x81C1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
+ .hword 0x932B, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
+ .hword 0x7C96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
+ .hword 0xC7E6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
+ .hword 0x285B, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
+ .hword 0xE232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
+ .hword 0x0D8F, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
+ .hword 0xB6FF, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
+ .hword 0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
+ .hword 0x4BA8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
+ .hword 0xA415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
+ .hword 0x1F65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
+ .hword 0xF0D8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3
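
The short-buffer path above (the byte loop between .crc_table_loop and .end) is a plain table-driven CRC computed while copying. A hedged C reference model of that path, including a generator for the 0x8bb7 T10-DIF table stored at .LANCHOR0; this is a reading aid only, not library code:

#include <stdint.h>
#include <stddef.h>

#define T10DIF_POLY 0x8bb7	/* CRC-16/T10-DIF, MSB-first (non-reflected) */

static uint16_t crc16tab[256];

/* Build the same 256-entry table the assembly keeps at .LANCHOR0. */
static void crc16_t10dif_init_table(void)
{
	for (int i = 0; i < 256; i++) {
		uint16_t r = (uint16_t)(i << 8);
		for (int j = 0; j < 8; j++)
			r = (r & 0x8000) ? (uint16_t)((r << 1) ^ T10DIF_POLY)
					 : (uint16_t)(r << 1);
		crc16tab[i] = r;
	}
}

/* Byte-at-a-time CRC while copying, matching the .crc_table_loop body:
 *   crc = (crc << 8) ^ tab[(crc >> 8) ^ byte]
 */
static uint16_t crc16_t10dif_copy_ref(uint16_t seed, uint8_t *dst,
				      const uint8_t *src, size_t len)
{
	uint16_t crc = seed;

	for (size_t i = 0; i < len; i++) {
		dst[i] = src[i];
		crc = (uint16_t)((crc << 8) ^
				 crc16tab[(uint8_t)((crc >> 8) ^ src[i])]);
	}
	return crc;
}
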
diff --git a/src/isa-l/crc/aarch64/crc16_t10dif_pmull.S b/src/isa-l/crc/aarch64/crc16_t10dif_pmull.S
new file mode 100644
index 000000000..08f1a35ad
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc16_t10dif_pmull.S
@@ -0,0 +1,404 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+ .arch armv8-a+crc+crypto
+ .text
+ .align 3
+ .global crc16_t10dif_pmull
+ .type crc16_t10dif_pmull, %function
+
+/* uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf, uint64_t len) */
+
+/* arguments */
+w_seed .req w0
+x_buf .req x1
+x_len .req x2
+w_len .req w2
+
+/* returns */
+w_ret .req w0
+
+/* these are used as global temporary registers */
+w_tmp .req w5
+x_tmp .req x5
+x_tmp1 .req x6
+x_tmp2 .req x7
+
+d_tmp1 .req d0
+d_tmp2 .req d1
+q_tmp1 .req q0
+q_tmp2 .req q1
+v_tmp1 .req v0
+v_tmp2 .req v1
+
+/* local variables */
+w_counter .req w3
+w_crc .req w0
+x_crc .req x0
+x_counter .req x3
+x_crc16tab .req x4
+x_buf_saved .req x0
+
+crc16_t10dif_pmull:
+ cmp x_len, 1023
+ sub sp, sp, #16
+ uxth w_seed, w_seed
+ bhi .crc_fold
+
+ mov x_tmp, 0
+ mov w_counter, 0
+
+.crc_table_loop_pre:
+ cmp x_len, x_tmp
+ bls .end
+
+ sxtw x_counter, w_counter
+ adrp x_crc16tab, .LANCHOR0
+ sub x_buf, x_buf, x_counter
+ add x_crc16tab, x_crc16tab, :lo12:.LANCHOR0
+
+ .align 2
+.crc_table_loop:
+ ldrb w_tmp, [x_buf, x_counter]
+ add x_counter, x_counter, 1
+ cmp x_len, x_counter
+ eor w_tmp, w_tmp, w_crc, lsr 8
+ ldrh w_tmp, [x_crc16tab, w_tmp, sxtw 1]
+ eor w_crc, w_tmp, w_crc, lsl 8
+ uxth w_crc, w_crc
+ bhi .crc_table_loop
+
+.end:
+ add sp, sp, 16
+ ret
+
+/* carry less multiplication, part1 - before loop */
+q_x0 .req q2
+q_x1 .req q3
+q_x2 .req q4
+q_x3 .req q5
+
+v_x0 .req v2
+v_x1 .req v3
+v_x2 .req v4
+v_x3 .req v5
+
+d_x0 .req d2
+d_x1 .req d3
+d_x2 .req d4
+d_x3 .req d5
+
+// the following registers are only used in this part (part1)
+d_tmp3 .req d16
+v_tmp3 .req v16
+
+ .align 3
+.crc_fold:
+ fmov d_tmp1, x_crc
+ fmov d_tmp2, xzr
+ dup d_tmp3, v_tmp2.d[0]
+ shl d_tmp1, d_tmp1, 48
+ ins v_tmp3.d[1], v_tmp1.d[0]
+
+ and x_counter, x_len, -64
+ sub x_counter, x_counter, #64
+ cmp x_counter, 63
+ add x_buf_saved, x_buf, 64
+
+ ldr q_x0, [x_buf]
+ ldr q_x1, [x_buf, 16]
+ ldr q_x2, [x_buf, 32]
+ ldr q_x3, [x_buf, 48]
+
+ adrp x_tmp, .shuffle_mask_lanchor
+ ldr q7, [x_tmp, :lo12:.shuffle_mask_lanchor]
+
+ tbl v_tmp1.16b, {v_x0.16b}, v7.16b
+ eor v_x0.16b, v_tmp3.16b, v_tmp1.16b
+
+ tbl v_x1.16b, {v_x1.16b}, v7.16b
+ tbl v_x2.16b, {v_x2.16b}, v7.16b
+ tbl v_x3.16b, {v_x3.16b}, v7.16b
+ bls .crc_fold_loop_end
+
+/* carry less multiplication, part2 - loop */
+q_y0 .req q28
+q_y1 .req q29
+q_y2 .req q30
+q_y3 .req q31
+
+v_y0 .req v28
+v_y1 .req v29
+v_y2 .req v30
+v_y3 .req v31
+
+d_x0_h .req d24
+d_x0_l .req d2
+d_x1_h .req d25
+d_x1_l .req d3
+d_x2_h .req d26
+d_x2_l .req d4
+d_x3_h .req d27
+d_x3_l .req d5
+
+v_x0_h .req v24
+v_x0_l .req v2
+v_x1_h .req v25
+v_x1_l .req v3
+v_x2_h .req v26
+v_x2_l .req v4
+v_x3_h .req v27
+v_x3_l .req v5
+
+v_tmp1_x0 .req v24
+v_tmp1_x1 .req v25
+v_tmp1_x2 .req v26
+v_tmp1_x3 .req v27
+
+d_p4_h .req d19
+v_p4_h .req v19
+d_p4_l .req d17
+v_p4_l .req v17
+
+ mov x_tmp, 0x371d0000 /* p4 [1] */
+ fmov d_p4_h, x_tmp
+ mov x_tmp, 0x87e70000 /* p4 [0] */
+ fmov d_p4_l, x_tmp
+
+ .align 2
+.crc_fold_loop:
+ add x_buf_saved, x_buf_saved, 64
+ sub x_counter, x_counter, #64
+ cmp x_counter, 63
+
+ dup d_x0_h, v_x0.d[1]
+ dup d_x1_h, v_x1.d[1]
+ dup d_x2_h, v_x2.d[1]
+ dup d_x3_h, v_x3.d[1]
+
+ dup d_x0_l, v_x0.d[0]
+ dup d_x1_l, v_x1.d[0]
+ dup d_x2_l, v_x2.d[0]
+ dup d_x3_l, v_x3.d[0]
+
+ ldr q_y0, [x_buf_saved, -64]
+ ldr q_y1, [x_buf_saved, -48]
+ ldr q_y2, [x_buf_saved, -32]
+ ldr q_y3, [x_buf_saved, -16]
+
+ pmull v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
+ pmull v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
+ pmull v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
+ pmull v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
+ pmull v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
+ pmull v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
+ pmull v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
+ pmull v_x3_l.1q, v_x3_l.1d, v_p4_l.1d
+
+ tbl v_y0.16b, {v_y0.16b}, v7.16b
+ tbl v_y1.16b, {v_y1.16b}, v7.16b
+ tbl v_y2.16b, {v_y2.16b}, v7.16b
+ tbl v_y3.16b, {v_y3.16b}, v7.16b
+
+ eor v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
+ eor v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
+ eor v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
+ eor v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b
+
+ eor v_x0.16b, v_tmp1_x0.16b, v_y0.16b
+ eor v_x1.16b, v_tmp1_x1.16b, v_y1.16b
+ eor v_x2.16b, v_tmp1_x2.16b, v_y2.16b
+ eor v_x3.16b, v_tmp1_x3.16b, v_y3.16b
+
+ bhi .crc_fold_loop
+
+/* carry less multiplication, part3 - after loop */
+/* folding 512bit ---> 128bit */
+
+// input parameters:
+// v_x0 => v2
+// v_x1 => v3
+// v_x2 => v4
+// v_x3 => v5
+
+// v0, v1, v6, v30 are tmp registers
+
+.crc_fold_loop_end:
+ mov x_tmp, 0x4c1a0000 /* p1 [1] */
+ fmov d0, x_tmp
+ mov x_tmp, 0xfb0b0000 /* p1 [0] */
+ fmov d1, x_tmp
+
+ and w_counter, w_len, -64
+ sxtw x_tmp, w_counter
+ add x_buf, x_buf, x_tmp
+
+ dup d6, v_x0.d[1]
+ dup d30, v_x0.d[0]
+ pmull v6.1q, v6.1d, v0.1d
+ pmull v30.1q, v30.1d, v1.1d
+ eor v6.16b, v6.16b, v30.16b
+ eor v_x1.16b, v6.16b, v_x1.16b
+
+ dup d6, v_x1.d[1]
+ dup d30, v_x1.d[0]
+ pmull v6.1q, v6.1d, v0.1d
+ pmull v16.1q, v30.1d, v1.1d
+ eor v6.16b, v6.16b, v16.16b
+ eor v_x2.16b, v6.16b, v_x2.16b
+
+ dup d_x0, v_x2.d[1]
+ dup d30, v_x2.d[0]
+ pmull v0.1q, v_x0.1d, v0.1d
+ pmull v_x0.1q, v30.1d, v1.1d
+ eor v1.16b, v0.16b, v_x0.16b
+ eor v_x0.16b, v1.16b, v_x3.16b
+
+/* carry less multiplication, part3 - after loop */
+/* crc16 fold function */
+d_16fold_p0_h .req d18
+v_16fold_p0_h .req v18
+
+d_16fold_p0_l .req d4
+v_16fold_p0_l .req v4
+
+v_16fold_from .req v_x0
+d_16fold_from_h .req d3
+v_16fold_from_h .req v3
+
+v_16fold_zero .req v7
+
+v_16fold_from1 .req v16
+
+v_16fold_from2 .req v0
+d_16fold_from2_h .req d6
+v_16fold_from2_h .req v6
+
+v_16fold_tmp .req v0
+
+ movi v_16fold_zero.4s, 0
+ mov x_tmp1, 0x2d560000 /* p0 [1] */
+ mov x_tmp2, 0x13680000 /* p0 [0] */
+
+ ext v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
+ ext v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4
+
+ dup d_16fold_from_h, v_16fold_from.d[1]
+ fmov d_16fold_p0_h, x_tmp1
+ pmull v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
+ eor v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b
+
+ dup d_16fold_from2_h, v_16fold_from2.d[1]
+ fmov d_16fold_p0_l, x_tmp2
+ pmull v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
+ eor v_x0.16b, v0.16b, v6.16b
+
+/* carry less multiplication, part3 - after loop */
+/* crc16 barrett reduction function */
+
+// input parameters:
+// v_x0: v2
+// barrett reduction constant: br[0], br[1]
+
+d_br0 .req d3
+v_br0 .req v3
+d_br1 .req d5
+v_br1 .req v5
+
+ mov x_tmp1, 0x57f9 /* br[0] low */
+ movk x_tmp1, 0xf65a, lsl 16 /* br[0] high */
+ movk x_tmp1, 0x1, lsl 32
+ fmov d_br0, x_tmp1
+
+ dup d1, v_x0.d[0]
+ dup d1, v1.d[0]
+ ext v1.16b, v1.16b, v7.16b, #4
+ pmull v4.1q, v1.1d, v_br0.1d
+
+ ext v1.16b, v4.16b, v7.16b, #4
+ mov x_tmp1, 0x8bb70000 /* br[1] low */
+ movk x_tmp1, 0x1, lsl 32 /* br[1] high */
+
+ fmov d_br1, x_tmp1
+ pmull v_br1.1q, v1.1d, v_br1.1d
+ eor v_x0.16b, v_x0.16b, v_br1.16b
+
+ umov x0, v_x0.d[0]
+ ubfx x0, x0, 16, 16
+ b .crc_table_loop_pre
+
+ .size crc16_t10dif_pmull, .-crc16_t10dif_pmull
+
+ .section .rodata
+
+ .align 4
+.shuffle_mask_lanchor = . + 0
+ .type shuffle_mask, %object
+ .size shuffle_mask, 16
+shuffle_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
+
+ .align 4
+.LANCHOR0 = . + 0
+ .type crc16tab, %object
+ .size crc16tab, 512
+crc16tab:
+ .hword 0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
+ .hword 0xEFBD, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
+ .hword 0x54CD, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
+ .hword 0xBB70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
+ .hword 0xA99A, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
+ .hword 0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
+ .hword 0xFD57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
+ .hword 0x12EA, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
+ .hword 0xD883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
+ .hword 0x373E, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
+ .hword 0x8C4E, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
+ .hword 0x63F3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
+ .hword 0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
+ .hword 0x9EA4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
+ .hword 0x25D4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
+ .hword 0xCA69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
+ .hword 0x3AB1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
+ .hword 0xD50C, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
+ .hword 0x6E7C, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
+ .hword 0x81C1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
+ .hword 0x932B, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
+ .hword 0x7C96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
+ .hword 0xC7E6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
+ .hword 0x285B, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
+ .hword 0xE232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
+ .hword 0x0D8F, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
+ .hword 0xB6FF, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
+ .hword 0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
+ .hword 0x4BA8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
+ .hword 0xA415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
+ .hword 0x1F65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
+ .hword 0xF0D8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3
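
Both PMULL variants above share the same folding structure: four 128-bit lanes are carried forward 64 bytes (512 bits) per iteration with carry-less multiplies, then reduced 512 -> 128 -> 16 bits after the loop. As a hedged summary of the per-lane fold step (the usual formulation; the constants in the code appear to be these values pre-shifted into the top of the 64-bit lane for this degree-16 polynomial):

\[
  X_i' \;=\; \bigl(X_i^{\mathrm{hi}} \cdot K_{\mathrm{hi}}\bigr) \;\oplus\;
             \bigl(X_i^{\mathrm{lo}} \cdot K_{\mathrm{lo}}\bigr) \;\oplus\; Y_i,
  \qquad
  K_{\mathrm{hi}} = x^{512+64} \bmod P(x), \quad
  K_{\mathrm{lo}} = x^{512} \bmod P(x),
\]

where \(\cdot\) is carry-less multiplication over \(\mathrm{GF}(2)[x]\), \(X_i\) is a running 128-bit lane, and \(Y_i\) is the newly loaded 128-bit block that sits 512 bits further along in the message.
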
diff --git a/src/isa-l/crc/aarch64/crc32_aarch64_common.h b/src/isa-l/crc/aarch64/crc32_aarch64_common.h
new file mode 100644
index 000000000..a2ef22aea
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_aarch64_common.h
@@ -0,0 +1,321 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+
+
+.macro crc32_hw_common poly_type
+
+.ifc \poly_type,crc32
+ mvn wCRC,wCRC
+.endif
+ cbz LEN, .zero_length_ret
+ tbz BUF, 0, .align_short
+ ldrb wdata,[BUF],1
+ sub LEN,LEN,1
+ crc32_u8 wCRC,wCRC,wdata
+.align_short:
+ tst BUF,2
+ ccmp LEN,1,0,ne
+ bhi .align_short_2
+ tst BUF,4
+ ccmp LEN,3,0,ne
+ bhi .align_word
+
+.align_finish:
+
+ cmp LEN, 63
+ bls .loop_16B
+.loop_64B:
+ ldp data0, data1, [BUF],#16
+ prfm pldl2keep,[BUF,2048]
+ sub LEN,LEN,#64
+ ldp data2, data3, [BUF],#16
+ prfm pldl1keep,[BUF,256]
+ cmp LEN,#64
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ ldp data0, data1, [BUF],#16
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ ldp data2, data3, [BUF],#16
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ bge .loop_64B
+
+.loop_16B:
+ cmp LEN, 15
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+	sub LEN,LEN,#16 // remaining length MUST now be less than 16B
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+.less_16B:
+ cmp LEN, 7
+ bls .less_8B
+ ldr data0, [BUF], 8
+ sub LEN, LEN, #8
+ crc32_u64 wCRC, wCRC, data0
+.less_8B:
+ cmp LEN, 3
+ bls .less_4B
+ ldr wdata, [BUF], 4
+ sub LEN, LEN, #4
+ crc32_u32 wCRC, wCRC, wdata
+.less_4B:
+ cmp LEN, 1
+ bls .less_2B
+ ldrh wdata, [BUF], 2
+ sub LEN, LEN, #2
+ crc32_u16 wCRC, wCRC, wdata
+.less_2B:
+ cbz LEN, .zero_length_ret
+ ldrb wdata, [BUF]
+ crc32_u8 wCRC, wCRC, wdata
+.zero_length_ret:
+.ifc \poly_type,crc32
+ mvn w0, wCRC
+.else
+ mov w0, wCRC
+.endif
+ ret
+.align_short_2:
+ ldrh wdata, [BUF], 2
+ sub LEN, LEN, 2
+ tst BUF, 4
+ crc32_u16 wCRC, wCRC, wdata
+ ccmp LEN, 3, 0, ne
+ bls .align_finish
+.align_word:
+ ldr wdata, [BUF], 4
+ sub LEN, LEN, #4
+ crc32_u32 wCRC, wCRC, wdata
+ b .align_finish
+.endm
+
+.macro crc32_3crc_fold poly_type
+.ifc \poly_type,crc32
+ mvn wCRC,wCRC
+.endif
+ cbz LEN, .zero_length_ret
+ tbz BUF, 0, .align_short
+ ldrb wdata,[BUF],1
+ sub LEN,LEN,1
+ crc32_u8 wCRC,wCRC,wdata
+.align_short:
+ tst BUF,2
+ ccmp LEN,1,0,ne
+ bhi .align_short_2
+ tst BUF,4
+ ccmp LEN,3,0,ne
+ bhi .align_word
+
+.align_finish:
+ cmp LEN,1023
+ adr const_adr, .Lconstants
+ bls 1f
+ ldp dconst0,dconst1,[const_adr]
+2:
+ ldr crc0_data0,[ptr_crc0],8
+ prfm pldl2keep,[ptr_crc0,3*1024-8]
+ mov crc1,0
+ mov crc2,0
+ add ptr_crc1,ptr_crc0,336
+ add ptr_crc2,ptr_crc0,336*2
+ crc32_u64 crc0,crc0,crc0_data0
+ .set offset,0
+ .set ptr_offset,8
+ .rept 5
+ ldp crc0_data0,crc0_data1,[ptr_crc0],16
+ ldp crc1_data0,crc1_data1,[ptr_crc1],16
+ .set offset,offset+64
+ .set ptr_offset,ptr_offset+16
+ prfm pldl2keep,[ptr_crc0,3*1024-ptr_offset+offset]
+ crc32_u64 crc0,crc0,crc0_data0
+ crc32_u64 crc0,crc0,crc0_data1
+ ldp crc2_data0,crc2_data1,[ptr_crc2],16
+ crc32_u64 crc1,crc1,crc1_data0
+ crc32_u64 crc1,crc1,crc1_data1
+ crc32_u64 crc2,crc2,crc2_data0
+ crc32_u64 crc2,crc2,crc2_data1
+ .endr
+ .set l1_offset,0
+ .rept 10
+ ldp crc0_data0,crc0_data1,[ptr_crc0],16
+ ldp crc1_data0,crc1_data1,[ptr_crc1],16
+ .set offset,offset+64
+ .set ptr_offset,ptr_offset+16
+ prfm pldl2keep,[ptr_crc0,3*1024-ptr_offset+offset]
+ prfm pldl1keep,[ptr_crc0,2*1024-ptr_offset+l1_offset]
+ .set l1_offset,l1_offset+64
+ crc32_u64 crc0,crc0,crc0_data0
+ crc32_u64 crc0,crc0,crc0_data1
+ ldp crc2_data0,crc2_data1,[ptr_crc2],16
+ crc32_u64 crc1,crc1,crc1_data0
+ crc32_u64 crc1,crc1,crc1_data1
+ crc32_u64 crc2,crc2,crc2_data0
+ crc32_u64 crc2,crc2,crc2_data1
+ .endr
+
+ .rept 6
+ ldp crc0_data0,crc0_data1,[ptr_crc0],16
+ ldp crc1_data0,crc1_data1,[ptr_crc1],16
+ .set ptr_offset,ptr_offset+16
+ prfm pldl1keep,[ptr_crc0,2*1024-ptr_offset+l1_offset]
+ .set l1_offset,l1_offset+64
+ crc32_u64 crc0,crc0,crc0_data0
+ crc32_u64 crc0,crc0,crc0_data1
+ ldp crc2_data0,crc2_data1,[ptr_crc2],16
+ crc32_u64 crc1,crc1,crc1_data0
+ crc32_u64 crc1,crc1,crc1_data1
+ crc32_u64 crc2,crc2,crc2_data0
+ crc32_u64 crc2,crc2,crc2_data1
+ .endr
+ ldr crc2_data0,[ptr_crc2]
+ fmov dtmp0,xcrc0
+ fmov dtmp1,xcrc1
+ crc32_u64 crc2,crc2,crc2_data0
+ add ptr_crc0,ptr_crc0,1024-(336+8)
+ pmull vtmp0.1q,vtmp0.1d,vconst0.1d
+ sub LEN,LEN,1024
+ pmull vtmp1.1q,vtmp1.1d,vconst1.1d
+ cmp LEN,1024
+ fmov xcrc0,dtmp0
+ fmov xcrc1,dtmp1
+ crc32_u64 crc0,wzr,xcrc0
+ crc32_u64 crc1,wzr,xcrc1
+
+ eor crc0,crc0,crc2
+ eor crc0,crc0,crc1
+
+ bhs 2b
+1:
+ cmp LEN, 63
+ bls .loop_16B
+.loop_64B:
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#64
+ ldp data2, data3, [BUF],#16
+ cmp LEN,#64
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ ldp data0, data1, [BUF],#16
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ ldp data2, data3, [BUF],#16
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ bge .loop_64B
+
+.loop_16B:
+ cmp LEN, 15
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+	sub LEN,LEN,#16 // remaining length MUST now be less than 16B
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+.less_16B:
+ cmp LEN, 7
+ bls .less_8B
+ ldr data0, [BUF], 8
+ sub LEN, LEN, #8
+ crc32_u64 wCRC, wCRC, data0
+.less_8B:
+ cmp LEN, 3
+ bls .less_4B
+ ldr wdata, [BUF], 4
+ sub LEN, LEN, #4
+ crc32_u32 wCRC, wCRC, wdata
+.less_4B:
+ cmp LEN, 1
+ bls .less_2B
+ ldrh wdata, [BUF], 2
+ sub LEN, LEN, #2
+ crc32_u16 wCRC, wCRC, wdata
+.less_2B:
+ cbz LEN, .zero_length_ret
+ ldrb wdata, [BUF]
+ crc32_u8 wCRC, wCRC, wdata
+.zero_length_ret:
+.ifc \poly_type,crc32
+ mvn w0, wCRC
+.else
+ mov w0, wCRC
+.endif
+ ret
+.align_short_2:
+ ldrh wdata, [BUF], 2
+ sub LEN, LEN, 2
+ tst BUF, 4
+ crc32_u16 wCRC, wCRC, wdata
+ ccmp LEN, 3, 0, ne
+ bls .align_finish
+.align_word:
+ ldr wdata, [BUF], 4
+ sub LEN, LEN, #4
+ crc32_u32 wCRC, wCRC, wdata
+ b .align_finish
+.Lconstants:
+.ifc \poly_type,crc32
+ .quad 0xb486819b
+ .quad 0x76278617
+.else
+ .quad 0xe417f38a
+ .quad 0x8f158014
+.endif
+
+.endm
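
The crc32_hw_common macro above is essentially the assembly form of the standard ACLE-intrinsics loop: align the pointer with byte/half/word steps, then consume 8 bytes per crc32x, with an unrolled 64-byte body. A hedged C equivalent of the crc32 (gzip polynomial) variant, with the alignment and tail handling simplified relative to the macro; a sketch, not the library's actual code path:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arm_acle.h>	/* __crc32b/__crc32h/__crc32w/__crc32d, needs -march=...+crc */

/* CRC32 (gzip polynomial) over buf[0..len), mirroring crc32_hw_common. */
static uint32_t crc32_hw_ref(uint32_t crc, const uint8_t *buf, size_t len)
{
	crc = ~crc;			/* the macro does mvn on entry for crc32 */

	/* Align to 8 bytes, roughly what .align_short/.align_word do. */
	while (len && ((uintptr_t)buf & 7)) {
		crc = __crc32b(crc, *buf++);
		len--;
	}

	/* Main body: 8 bytes per crc32x (the macro unrolls this 8x per 64B). */
	while (len >= 8) {
		uint64_t d;
		memcpy(&d, buf, 8);
		crc = __crc32d(crc, d);
		buf += 8;
		len -= 8;
	}

	/* Tail, mirroring .less_8B / .less_4B / .less_2B. */
	if (len >= 4) { uint32_t d; memcpy(&d, buf, 4); crc = __crc32w(crc, d); buf += 4; len -= 4; }
	if (len >= 2) { uint16_t d; memcpy(&d, buf, 2); crc = __crc32h(crc, d); buf += 2; len -= 2; }
	if (len)      { crc = __crc32b(crc, *buf); }

	return ~crc;			/* final mvn for the crc32 polynomial */
}
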
diff --git a/src/isa-l/crc/aarch64/crc32_common_crc_ext_cortex_a72.S b/src/isa-l/crc/aarch64/crc32_common_crc_ext_cortex_a72.S
new file mode 100644
index 000000000..7c9ca35ad
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_common_crc_ext_cortex_a72.S
@@ -0,0 +1,135 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+
+
+.macro crc32_hw_common poly_type
+ cbz LEN, .zero_length_ret
+.ifc \poly_type,crc32
+ mvn wCRC,wCRC
+.endif
+ tbz BUF, 0, .align_short
+ ldrb wdata,[BUF],1
+ sub LEN,LEN,1
+ crc32_u8 wCRC,wCRC,wdata
+.align_short:
+ tst BUF,2
+ ccmp LEN,1,0,ne
+ bhi .align_short_2
+ tst BUF,4
+ ccmp LEN,3,0,ne
+ bhi .align_word
+
+.align_finish:
+
+ cmp LEN, 63
+ bls .loop_16B
+.loop_64B:
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#64
+ ldp data2, data3, [BUF],#16
+ cmp LEN,#64
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ ldp data0, data1, [BUF],#16
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ ldp data2, data3, [BUF],#16
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ bge .loop_64B
+
+.loop_16B:
+ cmp LEN, 15
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+	sub LEN,LEN,#16 // remaining length MUST now be less than 16B
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+.less_16B:
+ cmp LEN, 7
+ bls .less_8B
+ ldr data0, [BUF], 8
+ sub LEN, LEN, #8
+ crc32_u64 wCRC, wCRC, data0
+.less_8B:
+ cmp LEN, 3
+ bls .less_4B
+ ldr wdata, [BUF], 4
+ sub LEN, LEN, #4
+ crc32_u32 wCRC, wCRC, wdata
+.less_4B:
+ cmp LEN, 1
+ bls .less_2B
+ ldrh wdata, [BUF], 2
+ sub LEN, LEN, #2
+ crc32_u16 wCRC, wCRC, wdata
+.less_2B:
+ cbz LEN, .finish_exit
+ ldrb wdata, [BUF]
+ crc32_u8 wCRC, wCRC, wdata
+.finish_exit:
+.ifc \poly_type,crc32
+ mvn w0, wCRC
+.else
+ mov w0, wCRC
+.endif
+ ret
+.zero_length_ret:
+ mov w0, wCRC
+ ret
+.align_short_2:
+ ldrh wdata, [BUF], 2
+ sub LEN, LEN, 2
+ tst BUF, 4
+ crc32_u16 wCRC, wCRC, wdata
+ ccmp LEN, 3, 0, ne
+ bls .align_finish
+.align_word:
+ ldr wdata, [BUF], 4
+ sub LEN, LEN, #4
+ crc32_u32 wCRC, wCRC, wdata
+ b .align_finish
+
+.endm
diff --git a/src/isa-l/crc/aarch64/crc32_common_mix_neoverse_n1.S b/src/isa-l/crc/aarch64/crc32_common_mix_neoverse_n1.S
new file mode 100644
index 000000000..4911a30b8
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_common_mix_neoverse_n1.S
@@ -0,0 +1,432 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+ \name\()_d .req d\reg
+.endm
+ declare_var_vector_reg k1k2,20
+ declare_var_vector_reg k3k4,21
+ declare_var_vector_reg poly,22
+ declare_var_vector_reg k5k0,23
+ declare_var_vector_reg mask,24
+ declare_var_vector_reg fold_poly,25
+
+ declare_var_vector_reg tmp0,0
+ declare_var_vector_reg tmp1,1
+ declare_var_vector_reg tmp2,2
+ declare_var_vector_reg tmp3,3
+ declare_var_vector_reg tmp4,4
+ declare_var_vector_reg tmp5,5
+ declare_var_vector_reg tmp6,6
+ declare_var_vector_reg tmp7,7
+ declare_var_vector_reg pmull_data0,16
+ declare_var_vector_reg pmull_data1,17
+ declare_var_vector_reg pmull_data2,18
+ declare_var_vector_reg pmull_data3,19
+
+ vzr .req v26
+
+ const_addr .req x3
+ crc_blk_ptr .req x4
+ pmull_blk_ptr .req x5
+ crc_data0 .req x6
+ crc_data1 .req x7
+ crc_data2 .req x9
+ crc_data3 .req x10
+ wPmull .req w11
+ xPmull .req x11
+
+ data0 .req x4
+ data1 .req x5
+ data2 .req x6
+ data3 .req x7
+ wdata .req w4
+
+.macro pmull_fold
+
+ pmull2 tmp4_v.1q, tmp0_v.2d, k1k2_v.2d
+ pmull2 tmp5_v.1q, tmp1_v.2d, k1k2_v.2d
+ pmull2 tmp6_v.1q, tmp2_v.2d, k1k2_v.2d
+ pmull2 tmp7_v.1q, tmp3_v.2d, k1k2_v.2d
+
+ pmull tmp0_v.1q, tmp0_v.1d, k1k2_v.1d
+ pmull tmp1_v.1q, tmp1_v.1d, k1k2_v.1d
+ pmull tmp2_v.1q, tmp2_v.1d, k1k2_v.1d
+ pmull tmp3_v.1q, tmp3_v.1d, k1k2_v.1d
+ ld1 {pmull_data0_v.16b-pmull_data3_v.16b},[pmull_blk_ptr],#64
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+ eor tmp1_v.16b, tmp1_v.16b, tmp5_v.16b
+ eor tmp2_v.16b, tmp2_v.16b, tmp6_v.16b
+ eor tmp3_v.16b, tmp3_v.16b, tmp7_v.16b
+
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, v16.16b
+ eor tmp1_v.16b, tmp1_v.16b, v17.16b
+ eor tmp2_v.16b, tmp2_v.16b, v18.16b
+ eor tmp3_v.16b, tmp3_v.16b, v19.16b
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+.endm
+
+
+
+.macro crc32_common_mix poly_type
+ .set MIX_BLK_SIZE,2048
+
+.ifc \poly_type,crc32
+ mvn wCRC,wCRC
+.endif
+ cmp LEN,MIX_BLK_SIZE-1
+ adr const_addr, .Lconstants
+ bls start_final
+ ld1 {k1k2_v.16b,k3k4_v.16b,poly_v.16b},[const_addr],#48
+ movi vzr.16b, #0
+ ld1 {k5k0_v.8b,mask_v.8b,fold_poly_v.8b},[const_addr]
+
+loop_2048:
+ ld1 {tmp0_v.16b-tmp3_v.16b}, [BUF]
+ add pmull_blk_ptr,BUF,0x40
+ add crc_blk_ptr, BUF,512
+ mov tmp4_v.16b,vzr.16b
+ fmov tmp4_s, wCRC
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ eor tmp0_v.16b,tmp0_v.16b,tmp4_v.16b
+ mov wCRC, 0
+ sub LEN,LEN,MIX_BLK_SIZE
+ cmp LEN,MIX_BLK_SIZE
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+ pmull_fold
+ pmull_fold
+ pmull_fold
+ pmull_fold
+ pmull_fold
+ pmull_fold
+ pmull_fold
+
+ /* Folding cache line into 128bit */
+ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp2_v.16b
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp3_v.16b
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+
+
+ /**
+ * perform the last 64 bit fold, also
+ * adds 32 zeroes to the input stream
+ */
+ ext tmp1_v.16b, tmp0_v.16b, tmp0_v.16b, #8
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ pmull2 tmp1_v.1q, tmp1_v.2d, k3k4_v.2d
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ ext tmp0_v.16b, tmp0_v.16b, vzr.16b, #8
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+ /* final 32-bit fold */
+ ext tmp1_v.16b, tmp0_v.16b, vzr.16b, #4
+ and tmp0_v.16b, tmp0_v.16b, mask_v.16b
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ pmull tmp0_v.1q, tmp0_v.1d, k5k0_v.1d
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+
+ /**
+	 * Finish up with the bit-reversed Barrett
+	 * reduction, 64 ==> 32 bits
+ */
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ and tmp1_v.16b, tmp0_v.16b, mask_v.16b
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ ext tmp1_v.16b, vzr.16b, tmp1_v.16b, #8
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ pmull2 tmp1_v.1q, tmp1_v.2d, poly_v.2d
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ and tmp1_v.16b, tmp1_v.16b, mask_v.16b
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ pmull tmp1_v.1q, tmp1_v.1d, poly_v.1d
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ mov tmp4_v.16b,vzr.16b
+ mov tmp4_v.s[0], tmp0_v.s[1]
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ ldp crc_data0,crc_data1,[crc_blk_ptr],16
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+ ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+ crc32_u64 wCRC,wCRC,crc_data0
+ crc32_u64 wCRC,wCRC,crc_data1
+ crc32_u64 wCRC,wCRC,crc_data2
+ crc32_u64 wCRC,wCRC,crc_data3
+
+ pmull tmp4_v.1q, tmp4_v.1d, fold_poly_v.1d
+ add BUF,BUF,MIX_BLK_SIZE
+ fmov xPmull, tmp4_d
+ crc32_u64 wPmull, wzr, xPmull
+ eor wCRC, wPmull, wCRC
+ bge loop_2048
+start_final:
+ cmp LEN, 63
+ bls .loop_16B
+.loop_64B:
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#64
+ ldp data2, data3, [BUF],#16
+ cmp LEN,#64
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ ldp data0, data1, [BUF],#16
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ ldp data2, data3, [BUF],#16
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ crc32_u64 wCRC, wCRC, data2
+ crc32_u64 wCRC, wCRC, data3
+ bge .loop_64B
+
+.loop_16B:
+ cmp LEN, 15
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+ sub LEN,LEN,#16
+ cmp LEN,15
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+ bls .less_16B
+ ldp data0, data1, [BUF],#16
+	sub	LEN,LEN,#16	// remaining length is now less than 16B
+ crc32_u64 wCRC, wCRC, data0
+ crc32_u64 wCRC, wCRC, data1
+.less_16B:
+ cmp LEN, 7
+ bls .less_8B
+ ldr data0, [BUF], 8
+ sub LEN, LEN, #8
+ crc32_u64 wCRC, wCRC, data0
+.less_8B:
+ cmp LEN, 3
+ bls .less_4B
+ ldr wdata, [BUF], 4
+ sub LEN, LEN, #4
+ crc32_u32 wCRC, wCRC, wdata
+.less_4B:
+ cmp LEN, 1
+ bls .less_2B
+ ldrh wdata, [BUF], 2
+ sub LEN, LEN, #2
+ crc32_u16 wCRC, wCRC, wdata
+.less_2B:
+ cbz LEN, .finish_exit
+ ldrb wdata, [BUF]
+ crc32_u8 wCRC, wCRC, wdata
+.finish_exit:
+.ifc \poly_type,crc32
+ mvn w0, wCRC
+.else
+ mov w0, wCRC
+.endif
+ ret
+.endm
+
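The scalar tail above (the .loop_64B / .loop_16B / .less_* cascade) simply drains the remaining bytes with the widest CRC32 instruction that still fits. A minimal C sketch of the same idea using the ACLE intrinsics, with the 64B/16B unrolling collapsed into a single 8-byte loop (the helper name crc32_tail is illustrative, not part of the source; the crc32c flavour would use the __crc32c* intrinsics instead):

    #include <arm_acle.h>
    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Drain the tail with the widest CRC32 step available; the final
     * complement done at .finish_exit for the crc32 flavour is left out. */
    static uint32_t crc32_tail(uint32_t crc, const unsigned char *buf, size_t len)
    {
            while (len >= 8) {
                    uint64_t d;
                    memcpy(&d, buf, 8);
                    crc = __crc32d(crc, d);
                    buf += 8; len -= 8;
            }
            if (len >= 4) {
                    uint32_t d;
                    memcpy(&d, buf, 4);
                    crc = __crc32w(crc, d);
                    buf += 4; len -= 4;
            }
            if (len >= 2) {
                    uint16_t d;
                    memcpy(&d, buf, 2);
                    crc = __crc32h(crc, d);
                    buf += 2; len -= 2;
            }
            if (len)
                    crc = __crc32b(crc, *buf);
            return crc;
    }
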
diff --git a/src/isa-l/crc/aarch64/crc32_gzip_refl_3crc_fold.S b/src/isa-l/crc/aarch64/crc32_gzip_refl_3crc_fold.S
new file mode 100644
index 000000000..116d62cc9
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_gzip_refl_3crc_fold.S
@@ -0,0 +1,95 @@
+########################################################################
+# Copyright(c) 2020 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+#include "crc32_aarch64_common.h"
+ .text
+ .align 6
+ .arch armv8-a+crc+crypto
+.macro crc32_u64 dst,src,data
+ crc32x \dst,\src,\data
+.endm
+.macro crc32_u32 dst,src,data
+ crc32w \dst,\src,\data
+.endm
+.macro crc32_u16 dst,src,data
+ crc32h \dst,\src,\data
+.endm
+.macro crc32_u8 dst,src,data
+ crc32b \dst,\src,\data
+.endm
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+ BUF .req x1
+ ptr_crc0 .req x1
+ LEN .req x2
+ wCRC .req w0
+ crc0 .req w0
+ xcrc0 .req x0
+
+ crc1 .req w3
+ crc2 .req w4
+ xcrc1 .req x3
+ const_adr .req x3
+ ptr_crc1 .req x6
+ ptr_crc2 .req x7
+ crc0_data0 .req x9
+ crc0_data1 .req x10
+ crc1_data0 .req x11
+ crc1_data1 .req x12
+ crc2_data0 .req x13
+ crc2_data1 .req x14
+
+ wdata .req w3
+ data0 .req x3
+ data1 .req x4
+ data2 .req x5
+ data3 .req x6
+
+ declare_var_vector_reg tmp0,0
+ declare_var_vector_reg tmp1,1
+ declare_var_vector_reg const0,2
+ declare_var_vector_reg const1,3
+
+/**
+ uint32_t crc32_gzip_refl(
+ uint32_t wCRC,
+ const unsigned char *BUF,
+ uint64_t LEN
+ );
+*/
+
+ .global crc32_gzip_refl_3crc_fold
+ .type crc32_gzip_refl_3crc_fold, %function
+crc32_gzip_refl_3crc_fold:
+ crc32_3crc_fold crc32
+ .size crc32_gzip_refl_3crc_fold, .-crc32_gzip_refl_3crc_fold
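
The crc32_3crc_fold macro (pulled in from the included crc32_aarch64_common.h) runs three hardware-CRC streams over adjacent sub-blocks in parallel and then merges them, relying on CRC linearity: the raw CRC of A||B equals the CRC of A shifted by len(B) zero bytes, XORed with the CRC of B computed from a zero seed. A deliberately slow but self-contained C sketch of that combine step (helper names are illustrative; the assembly replaces the zero-byte shift with a single pmull against a precomputed constant):

    #include <arm_acle.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Multiply a raw (non-inverted) reflected CRC32 state by x^(8*nbytes)
     * mod P by clocking in zero bytes -- O(n), for illustration only. */
    static uint32_t crc32_shift_zeros(uint32_t crc, size_t nbytes)
    {
            while (nbytes--)
                    crc = __crc32b(crc, 0);
            return crc;
    }

    /* crc_a was computed over block A with the real seed, crc_b over the
     * following block B starting from 0; the result is the CRC over A||B. */
    static uint32_t crc32_combine2(uint32_t crc_a, uint32_t crc_b, size_t len_b)
    {
            return crc32_shift_zeros(crc_a, len_b) ^ crc_b;
    }
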
diff --git a/src/isa-l/crc/aarch64/crc32_gzip_refl_crc_ext.S b/src/isa-l/crc/aarch64/crc32_gzip_refl_crc_ext.S
new file mode 100644
index 000000000..8e3d227be
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_gzip_refl_crc_ext.S
@@ -0,0 +1,66 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .text
+ .align 6
+ .arch armv8-a+crc
+
+
+#include "crc32_aarch64_common.h"
+
+ BUF .req x1
+ LEN .req x2
+ wCRC .req w0
+ data0 .req x4
+ data1 .req x5
+ data2 .req x6
+ data3 .req x7
+ wdata .req w3
+.macro crc32_u64 dst,src,data
+ crc32x \dst,\src,\data
+.endm
+.macro crc32_u32 dst,src,data
+ crc32w \dst,\src,\data
+.endm
+.macro crc32_u16 dst,src,data
+ crc32h \dst,\src,\data
+.endm
+.macro crc32_u8 dst,src,data
+ crc32b \dst,\src,\data
+.endm
+
+ /**
+	 * uint32_t crc32_gzip_refl_crc_ext(uint32_t wCRC, const unsigned char *BUF,
+	 *				    uint64_t LEN);
+ */
+ .global crc32_gzip_refl_crc_ext
+ .type crc32_gzip_refl_crc_ext, %function
+crc32_gzip_refl_crc_ext:
+ crc32_hw_common crc32
+ .size crc32_gzip_refl_crc_ext, .-crc32_gzip_refl_crc_ext
diff --git a/src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.S b/src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.S
new file mode 100644
index 000000000..d52e2d8f5
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc32_gzip_refl_pmull.h"
+#include "crc32_refl_common_pmull.h"
+
+crc32_refl_func crc32_gzip_refl_pmull
diff --git a/src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.h b/src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.h
new file mode 100644
index 000000000..883567d97
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_gzip_refl_pmull.h
@@ -0,0 +1,87 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, 0x2d95
+.equ p4_low_b1, 0x8f35
+.equ p4_high_b0, 0x13d7
+.equ p4_high_b1, 0x1d95
+.equ p1_low_b0, 0x9191
+.equ p1_low_b1, 0xae68
+.equ p1_high_b0, 0x009e
+.equ p1_high_b1, 0xccaa
+.equ p0_low_b0, 0x6765
+.equ p0_low_b1, 0xb8bc
+.equ p0_high_b0, p1_high_b0
+.equ p0_high_b1, p1_high_b1
+.equ br_low_b0, 0x0641
+.equ br_low_b1, 0xdb71
+.equ br_low_b2, 0x1
+.equ br_high_b0, 0x1641
+.equ br_high_b1, 0xf701
+.equ br_high_b2, 0x1
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc32_table_gzip_refl, %object
+ .size crc32_table_gzip_refl, 1024
+crc32_table_gzip_refl:
+ .word 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3
+ .word 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91
+ .word 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7
+ .word 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5
+ .word 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b
+ .word 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59
+ .word 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f
+ .word 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d
+ .word 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433
+ .word 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01
+ .word 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457
+ .word 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65
+ .word 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb
+ .word 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9
+ .word 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f
+ .word 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad
+ .word 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683
+ .word 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1
+ .word 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7
+ .word 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5
+ .word 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b
+ .word 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79
+ .word 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f
+ .word 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d
+ .word 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713
+ .word 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21
+ .word 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777
+ .word 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45
+ .word 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db
+ .word 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9
+ .word 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf
+ .word 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
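
The crc32_table_gzip_refl data above is the standard reflected (LSB-first) lookup table for the gzip polynomial 0xEDB88320; a small generator that reproduces it, shown only to make the table's origin explicit:

    #include <stdint.h>

    /* Reflected CRC-32 (gzip/zlib) table, one bit per step, LSB first. */
    static void gen_crc32_table_gzip_refl(uint32_t table[256])
    {
            for (uint32_t i = 0; i < 256; i++) {
                    uint32_t c = i;
                    for (int k = 0; k < 8; k++)
                            c = (c >> 1) ^ (0xEDB88320u & (uint32_t)-(int32_t)(c & 1));
                    table[i] = c;   /* e.g. table[1] == 0x77073096, as in the data above */
            }
    }
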
diff --git a/src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.S b/src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.S
new file mode 100644
index 000000000..32966fb9d
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc32_ieee_norm_pmull.h"
+#include "crc32_norm_common_pmull.h"
+
+crc32_norm_func crc32_ieee_norm_pmull
diff --git a/src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.h b/src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.h
new file mode 100644
index 000000000..67acd2a03
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_ieee_norm_pmull.h
@@ -0,0 +1,87 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, 0x8b11
+.equ p4_low_b1, 0xe622
+.equ p4_high_b0, 0x794c
+.equ p4_high_b1, 0x8833
+.equ p1_low_b0, 0x5605
+.equ p1_low_b1, 0xe8a4
+.equ p1_high_b0, 0xcd4c
+.equ p1_high_b1, 0xc5b9
+.equ p0_low_b0, 0x678d
+.equ p0_low_b1, 0x490d
+.equ p0_high_b0, 0xaa66
+.equ p0_high_b1, 0xf200
+.equ br_low_b0, 0x01df
+.equ br_low_b1, 0x04d1
+.equ br_low_b2, 0x1
+.equ br_high_b0, 0x1db7
+.equ br_high_b1, 0x04c1
+.equ br_high_b2, 0x1
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc32_table_ieee_norm, %object
+ .size crc32_table_ieee_norm, 1024
+crc32_table_ieee_norm:
+ .word 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005
+ .word 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd
+ .word 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75
+ .word 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd
+ .word 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5
+ .word 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d
+ .word 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95
+ .word 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d
+ .word 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072
+ .word 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca
+ .word 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02
+ .word 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba
+ .word 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692
+ .word 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a
+ .word 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2
+ .word 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a
+ .word 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb
+ .word 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53
+ .word 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b
+ .word 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623
+ .word 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b
+ .word 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3
+ .word 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b
+ .word 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3
+ .word 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c
+ .word 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24
+ .word 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec
+ .word 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654
+ .word 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c
+ .word 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4
+ .word 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c
+ .word 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
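
crc32_table_ieee_norm is the non-reflected (MSB-first) counterpart for the IEEE polynomial 0x04C11DB7; the analogous generator:

    #include <stdint.h>

    /* Normal-form CRC-32 (IEEE 802.3) table, MSB-first bit order. */
    static void gen_crc32_table_ieee_norm(uint32_t table[256])
    {
            for (uint32_t i = 0; i < 256; i++) {
                    uint32_t c = i << 24;
                    for (int k = 0; k < 8; k++)
                            c = (c << 1) ^ (0x04C11DB7u & (uint32_t)-(int32_t)(c >> 31));
                    table[i] = c;   /* e.g. table[1] == 0x04c11db7, as in the data above */
            }
    }
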
diff --git a/src/isa-l/crc/aarch64/crc32_iscsi_3crc_fold.S b/src/isa-l/crc/aarch64/crc32_iscsi_3crc_fold.S
new file mode 100644
index 000000000..2beaa80c7
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_iscsi_3crc_fold.S
@@ -0,0 +1,97 @@
+########################################################################
+# Copyright(c) 2020 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+
+ .text
+ .align 6
+ .arch armv8-a+crc+crypto
+#include "crc32_aarch64_common.h"
+.macro crc32_u64 dst,src,data
+ crc32cx \dst,\src,\data
+.endm
+.macro crc32_u32 dst,src,data
+ crc32cw \dst,\src,\data
+.endm
+.macro crc32_u16 dst,src,data
+ crc32ch \dst,\src,\data
+.endm
+.macro crc32_u8 dst,src,data
+ crc32cb \dst,\src,\data
+.endm
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+ BUF .req x0
+ LEN .req x1
+ wCRC .req w2
+ crc0 .req w2
+ crc1 .req w3
+ crc2 .req w4
+ xcrc0 .req x2
+ xcrc1 .req x3
+ const_adr .req x3
+ ptr_crc0 .req x0
+ ptr_crc1 .req x6
+ ptr_crc2 .req x7
+ crc0_data0 .req x9
+ crc0_data1 .req x10
+ crc1_data0 .req x11
+ crc1_data1 .req x12
+ crc2_data0 .req x13
+ crc2_data1 .req x14
+
+ wdata .req w3
+ data0 .req x3
+ data1 .req x4
+ data2 .req x5
+ data3 .req x6
+
+ declare_var_vector_reg tmp0,0
+ declare_var_vector_reg tmp1,1
+ declare_var_vector_reg const0,2
+ declare_var_vector_reg const1,3
+
+/**
+ unsigned int crc32_iscsi(
+ unsigned char *BUF,
+ int LEN,
+ unsigned int wCRC
+ );
+
+*/
+
+ .global crc32_iscsi_3crc_fold
+ .type crc32_iscsi_3crc_fold, %function
+crc32_iscsi_3crc_fold:
+ crc32_3crc_fold crc32c
+ .size crc32_iscsi_3crc_fold, .-crc32_iscsi_3crc_fold
diff --git a/src/isa-l/crc/aarch64/crc32_iscsi_crc_ext.S b/src/isa-l/crc/aarch64/crc32_iscsi_crc_ext.S
new file mode 100644
index 000000000..359401a52
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_iscsi_crc_ext.S
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .text
+ .align 6
+ .arch armv8-a+crc
+
+
+#include "crc32_aarch64_common.h"
+ BUF .req x0
+ LEN .req x1
+ wCRC .req w2
+ data0 .req x4
+ data1 .req x5
+ data2 .req x6
+ data3 .req x7
+ wdata .req w3
+.macro crc32_u64 dst,src,data
+ crc32cx \dst,\src,\data
+.endm
+.macro crc32_u32 dst,src,data
+ crc32cw \dst,\src,\data
+.endm
+.macro crc32_u16 dst,src,data
+ crc32ch \dst,\src,\data
+.endm
+.macro crc32_u8 dst,src,data
+ crc32cb \dst,\src,\data
+.endm
+
+ /**
+ * uint32_t crc32_iscsi_crc_ext(const unsigned char *BUF,
+ * uint64_t LEN,uint32_t wCRC);
+ */
+ .global crc32_iscsi_crc_ext
+ .type crc32_iscsi_crc_ext, %function
+crc32_iscsi_crc_ext:
+ crc32_hw_common crc32c
+ .size crc32_iscsi_crc_ext, .-crc32_iscsi_crc_ext
diff --git a/src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.S b/src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.S
new file mode 100644
index 000000000..09a88e2e1
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.S
@@ -0,0 +1,53 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc32_iscsi_refl_pmull.h"
+#include "crc32_refl_common_pmull.h"
+
+crc32_refl_func crc32_iscsi_refl_pmull_internal
+
+ .arch armv8-a+crc+crypto
+ .text
+ .align 3
+ .global crc32_iscsi_refl_pmull
+ .type crc32_iscsi_refl_pmull, %function
+crc32_iscsi_refl_pmull:
+ stp x29, x30, [sp, -32]!
+ mov x29, sp
+
+ mov w7, w2
+ sxtw x2, w1
+ mov x1, x0
+ mov w0, w7
+ mvn w0, w0
+
+ bl crc32_iscsi_refl_pmull_internal
+ mvn w0, w0
+ ldp x29, x30, [sp], 32
+ ret
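
The wrapper above adapts the iscsi argument order and complement convention to the shared reflected-PMULL core: it shuffles (buf, len, crc) into (crc, buf, len), inverts the seed on the way in and the result on the way out. A rough C equivalent (a sketch of the calling convention only, not the source):

    #include <stdint.h>

    /* Reflected-PMULL core shared with the gzip variant; expects (crc, buf, len)
     * with the seed already complemented. */
    uint32_t crc32_iscsi_refl_pmull_internal(uint32_t crc, const unsigned char *buf, uint64_t len);

    unsigned int crc32_iscsi_refl_pmull_c(unsigned char *buf, int len, unsigned int crc_init)
    {
            uint32_t crc = crc32_iscsi_refl_pmull_internal(~crc_init, buf, (uint64_t)len);
            return ~crc;
    }
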
diff --git a/src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.h b/src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.h
new file mode 100644
index 000000000..c17b91be3
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_iscsi_refl_pmull.h
@@ -0,0 +1,87 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, 0xef02
+.equ p4_low_b1, 0x740e
+.equ p4_high_b0, 0xddf8
+.equ p4_high_b1, 0x9e4a
+.equ p1_low_b0, 0x0dfe
+.equ p1_low_b1, 0xf20c
+.equ p1_high_b0, 0x7d27
+.equ p1_high_b1, 0x493c
+.equ p0_low_b0, 0xaab8
+.equ p0_low_b1, 0xdd45
+.equ p0_high_b0, p1_high_b0
+.equ p0_high_b1, p1_high_b1
+.equ br_low_b0, 0x76f1
+.equ br_low_b1, 0x05ec
+.equ br_low_b2, 0x1
+.equ br_high_b0, 0x13f1
+.equ br_high_b1, 0xdea7
+.equ br_high_b2, 0x0
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc32_table_iscsi_refl, %object
+ .size crc32_table_iscsi_refl, 1024
+crc32_table_iscsi_refl:
+ .word 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB
+ .word 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24
+ .word 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384
+ .word 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B
+ .word 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35
+ .word 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA
+ .word 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A
+ .word 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595
+ .word 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957
+ .word 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198
+ .word 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38
+ .word 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7
+ .word 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789
+ .word 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46
+ .word 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6
+ .word 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829
+ .word 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93
+ .word 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C
+ .word 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC
+ .word 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033
+ .word 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D
+ .word 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982
+ .word 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622
+ .word 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED
+ .word 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F
+ .word 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0
+ .word 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540
+ .word 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F
+ .word 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1
+ .word 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E
+ .word 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E
+ .word 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351
diff --git a/src/isa-l/crc/aarch64/crc32_mix_default.S b/src/isa-l/crc/aarch64/crc32_mix_default.S
new file mode 100644
index 000000000..05c34074d
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_mix_default.S
@@ -0,0 +1,107 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a+crypto+crc
+ .text
+ .align 6
+
+#define CRC32
+
+.macro crc32_u64 dst,src,data
+ crc32x \dst,\src,\data
+.endm
+
+.macro crc32_u32 dst,src,data
+ crc32w \dst,\src,\data
+.endm
+
+.macro crc32_u16 dst,src,data
+ crc32h \dst,\src,\data
+.endm
+
+.macro crc32_u8 dst,src,data
+ crc32b \dst,\src,\data
+.endm
+
+#include "crc32_mix_default_common.S"
+
+ .global crc32_mix_default
+ .type crc32_mix_default, %function
+crc32_mix_default:
+ crc32_mix_main_default
+ .size crc32_mix_default, .-crc32_mix_default
+
+ .section .rodata
+ .align 4
+ .set lanchor_crc32,. + 0
+
+ .type k1k2, %object
+ .size k1k2, 16
+k1k2:
+ .xword 0x0154442bd4
+ .xword 0x01c6e41596
+
+ .type k3k4, %object
+ .size k3k4, 16
+k3k4:
+ .xword 0x01751997d0
+ .xword 0x00ccaa009e
+
+ .type k5k0, %object
+ .size k5k0, 16
+k5k0:
+ .xword 0x0163cd6124
+ .xword 0
+
+ .type poly, %object
+ .size poly, 16
+poly:
+ .xword 0x01db710641
+ .xword 0x01f7011641
+
+ .type crc32_const, %object
+ .size crc32_const, 48
+crc32_const:
+ .xword 0x1753ab84
+ .xword 0
+ .xword 0xbbf2f6d6
+ .xword 0
+ .xword 0x0c30f51d
+ .xword 0
+
+ .align 4
+ .set .lanchor_mask,. + 0
+
+ .type mask, %object
+ .size mask, 16
+mask:
+ .word -1
+ .word 0
+ .word -1
+ .word 0
diff --git a/src/isa-l/crc/aarch64/crc32_mix_default_common.S b/src/isa-l/crc/aarch64/crc32_mix_default_common.S
new file mode 100644
index 000000000..106da209a
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_mix_default_common.S
@@ -0,0 +1,563 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+.macro declare_generic_reg name:req, reg:req, default:req
+ \name .req \default\reg
+ w_\name .req w\reg
+ x_\name .req x\reg
+.endm
+
+.macro declare_neon_reg name:req, reg:req, default:req
+ \name .req \default\reg
+ v_\name .req v\reg
+ q_\name .req q\reg
+ d_\name .req d\reg
+ s_\name .req s\reg
+.endm
+
+/**********************************************************************
+ variables
+**********************************************************************/
+ declare_generic_reg crc, 0,w
+ declare_generic_reg buf, 1,x
+ declare_generic_reg len, 2,x
+ declare_generic_reg buf_saved, 3,x
+ declare_generic_reg buf_iter, 4,x
+ declare_generic_reg len_saved, 5,x
+ declare_generic_reg buf_tmp, 6,x
+
+ declare_generic_reg crc0, 7,x
+ declare_generic_reg crc1, 8,x
+ declare_generic_reg crc2, 9,x
+ declare_generic_reg pconst, 10,x
+ declare_generic_reg data_crc0, 11,x
+ declare_generic_reg data_crc1, 12,x
+ declare_generic_reg data_crc2, 13,x
+
+ declare_generic_reg size, 9,x
+ declare_generic_reg crc_tmp, 10,w
+ declare_generic_reg size_tmp, 11,x
+ declare_generic_reg data_tmp1, 11,x
+ declare_generic_reg data_tmp2, 12,x
+ declare_generic_reg data_tmp3, 13,x
+
+ declare_generic_reg tmp, 14,x
+ declare_generic_reg tmp1, 15,x
+
+// return
+ declare_generic_reg ret_crc, 0,w
+
+/**********************************************************************
+ simd variables
+**********************************************************************/
+ declare_neon_reg a0, 0,v
+ declare_neon_reg a1, 1,v
+ declare_neon_reg a2, 2,v
+ declare_neon_reg a3, 3,v
+ declare_neon_reg a4, 4,v
+
+ declare_neon_reg a5, 16,v
+ declare_neon_reg a6, 17,v
+ declare_neon_reg a7, 18,v
+ declare_neon_reg a8, 19,v
+
+ declare_neon_reg y5, 20,v
+ declare_neon_reg y6, 21,v
+ declare_neon_reg y7, 22,v
+ declare_neon_reg y8, 23,v
+
+ declare_neon_reg neon_zero, 24,v
+ declare_neon_reg neon_tmp, 24,v
+
+ declare_neon_reg k5k0, 25,v
+ declare_neon_reg neon_tmp1, 26,v
+ declare_neon_reg neon_tmp2, 27,v
+ declare_neon_reg neon_tmp3, 28,v
+
+ declare_neon_reg crc_pmull, 29,v
+ declare_neon_reg neon_crc0, 30,v
+ declare_neon_reg neon_crc1, 31,v
+
+ declare_neon_reg neon_const0, 5,v
+ declare_neon_reg neon_const1, 6,v
+ declare_neon_reg neon_const2, 7,v
+
+// constants
+ .equ offset_k3k4, 16
+ .equ offset_k5k0, 32
+ .equ offset_poly, 48
+ .equ offset_crc32_const, 64
+
+// pmull fold
+.macro pmull_fold
+ ldr x_data_crc0, [x_buf_tmp, 464]
+ ldr x_data_crc1, [x_buf_tmp, 976]
+ ldr x_data_crc2, [x_buf_tmp, 1488]
+
+ pmull v_a5.1q, v_a1.1d, v_a0.1d
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ ldr x_data_crc0, [x_buf_tmp, 472]
+ ldr x_data_crc1, [x_buf_tmp, 984]
+ ldr x_data_crc2, [x_buf_tmp, 1496]
+
+ pmull v_a6.1q, v_a2.1d, v_a0.1d
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ ldr x_data_crc0, [x_buf_tmp, 480]
+ ldr x_data_crc1, [x_buf_tmp, 992]
+ ldr x_data_crc2, [x_buf_tmp, 1504]
+
+ pmull v_a7.1q, v_a3.1d, v_a0.1d
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ ldr x_data_crc0, [x_buf_tmp, 488]
+ ldr x_data_crc1, [x_buf_tmp, 1000]
+ ldr x_data_crc2, [x_buf_tmp, 1512]
+
+ pmull v_a8.1q, v_a4.1d, v_a0.1d
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ ldr x_data_crc0, [x_buf_tmp, 496]
+ ldr x_data_crc1, [x_buf_tmp, 1008]
+ ldr x_data_crc2, [x_buf_tmp, 1520]
+
+ pmull2 v_a1.1q, v_a1.2d, v_a0.2d
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ ld1 {v_y5.4s, v_y6.4s, v_y7.4s, v_y8.4s}, [x_buf_tmp]
+
+ ldr x_data_crc0, [x_buf_tmp, 504]
+ ldr x_data_crc1, [x_buf_tmp, 1016]
+ ldr x_data_crc2, [x_buf_tmp, 1528]
+
+ pmull2 v_a2.1q, v_a2.2d, v_a0.2d
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ pmull2 v_a3.1q, v_a3.2d, v_a0.2d
+ pmull2 v_a4.1q, v_a4.2d, v_a0.2d
+
+ eor v_y5.16b, v_y5.16b, v_a5.16b
+ eor v_y6.16b, v_y6.16b, v_a6.16b
+ eor v_y7.16b, v_y7.16b, v_a7.16b
+ eor v_y8.16b, v_y8.16b, v_a8.16b
+
+ ldr x_data_crc0, [x_buf_tmp, 512]
+ ldr x_data_crc1, [x_buf_tmp, 1024]
+ ldr x_data_crc2, [x_buf_tmp, 1536]
+
+ eor v_a1.16b, v_y5.16b, v_a1.16b
+ eor v_a2.16b, v_y6.16b, v_a2.16b
+ eor v_a3.16b, v_y7.16b, v_a3.16b
+ eor v_a4.16b, v_y8.16b, v_a4.16b
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ ldr x_data_crc0, [x_buf_tmp, 520]
+ ldr x_data_crc1, [x_buf_tmp, 1032]
+ ldr x_data_crc2, [x_buf_tmp, 1544]
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+.endm
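
pmull_fold advances four 128-bit accumulators (a1..a4) by 64 bytes of input per invocation, multiplying each accumulator by the folding constants and XORing in the freshly loaded data, while the interleaved crc32 instructions keep three independent scalar CRC streams moving through the other pipeline. The vector half of one accumulator update, written with NEON intrinsics (a sketch; the constant pair corresponds to the k1k2 value loaded by the including file):

    #include <arm_neon.h>

    /* One 128-bit fold step: multiply the low/high halves of the accumulator
     * by the two folding constants and XOR in the next 16 bytes of input. */
    static inline uint64x2_t fold128(uint64x2_t acc, poly64x2_t k1k2, uint64x2_t next16)
    {
            poly128_t lo = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(acc), 0),
                                     vgetq_lane_p64(k1k2, 0));
            poly128_t hi = vmull_high_p64(vreinterpretq_p64_u64(acc), k1k2);
            uint64x2_t folded = veorq_u64(vreinterpretq_u64_p128(lo),
                                          vreinterpretq_u64_p128(hi));
            return veorq_u64(folded, next16);
    }
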
+
+// crc32 mix for 2048 byte input data
+.macro crc32_mix2048
+ fmov s_a1, w_crc
+ movi v_neon_tmp.4s, 0
+
+ adrp x_pconst, lanchor_crc32
+ add x_buf_tmp, x_buf, 64
+
+ ldr x_data_crc0, [x_buf, 512]
+ ldr x_data_crc1, [x_buf, 1024]
+ ldr x_data_crc2, [x_buf, 1536]
+
+ crc32_u64 w_crc0, wzr, x_data_crc0
+ crc32_u64 w_crc1, wzr, x_data_crc1
+ crc32_u64 w_crc2, wzr, x_data_crc2
+
+#ifdef CRC32
+ mvn v_a1.8b, v_a1.8b
+#endif
+
+ ins v_neon_tmp.s[0], v_a1.s[0]
+
+ ld1 {v_a1.4s, v_a2.4s, v_a3.4s, v_a4.4s}, [x_buf]
+
+ ldr x_data_crc0, [x_buf, 520]
+ ldr x_data_crc1, [x_buf, 1032]
+ ldr x_data_crc2, [x_buf, 1544]
+
+ eor v_a1.16b, v_a1.16b, v_neon_tmp.16b
+ ldr q_a0, [x_pconst, #:lo12:lanchor_crc32] // k1k2
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+// main fold loop, fully unrolled (seven 64-byte pmull_fold steps)
+ .align 4
+ pmull_fold
+
+ add x_buf_tmp, x_buf_tmp, 64
+ pmull_fold
+
+ add x_buf_tmp, x_buf_tmp, 64
+ pmull_fold
+
+ add x_buf_tmp, x_buf_tmp, 64
+ pmull_fold
+
+ add x_buf_tmp, x_buf_tmp, 64
+ pmull_fold
+
+ add x_buf_tmp, x_buf_tmp, 64
+ pmull_fold
+
+ add x_buf_tmp, x_buf_tmp, 64
+ pmull_fold
+// loop end
+
+// PMULL: fold into 128-bits
+ add x_pconst, x_pconst, :lo12:lanchor_crc32
+
+ ldr x_data_crc0, [x_buf, 976]
+ ldr x_data_crc1, [x_buf, 1488]
+ ldr x_data_crc2, [x_buf, 2000]
+
+ ldr q_a0, [x_pconst, offset_k3k4] // k3k4
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ pmull v_a5.1q, v_a1.1d, v_a0.1d
+ pmull2 v_a1.1q, v_a1.2d, v_a0.2d
+
+ eor v_a1.16b, v_a5.16b, v_a1.16b
+ eor v_a1.16b, v_a1.16b, v_a2.16b
+
+ ldr x_data_crc0, [x_buf, 984]
+ ldr x_data_crc1, [x_buf, 1496]
+ ldr x_data_crc2, [x_buf, 2008]
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ pmull v_a5.1q, v_a1.1d, v_a0.1d
+ pmull2 v_a1.1q, v_a1.2d, v_a0.2d
+
+ ldr x_data_crc0, [x_buf, 992]
+ ldr x_data_crc1, [x_buf, 1504]
+ ldr x_data_crc2, [x_buf, 2016]
+
+ eor v_a1.16b, v_a5.16b, v_a1.16b
+ eor v_a1.16b, v_a1.16b, v_a3.16b
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ pmull v_a5.1q, v_a1.1d, v_a0.1d
+ pmull2 v_a1.1q, v_a1.2d, v_a0.2d
+
+ ldr x_data_crc0, [x_buf, 1000]
+ ldr x_data_crc1, [x_buf, 1512]
+ ldr x_data_crc2, [x_buf, 2024]
+
+ eor v_a1.16b, v_a5.16b, v_a1.16b
+ eor v_a1.16b, v_a1.16b, v_a4.16b
+
+// PMULL: fold 128-bits to 64-bits
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ dup d_a0, v_a0.d[1]
+ pmull v_a2.1q, v_a1.1d, v_a0.1d
+
+ movi v_neon_zero.4s, 0
+ ldr q_k5k0, [x_pconst, offset_k5k0] // k5k0
+ adrp x_tmp, .lanchor_mask
+
+ ldr x_data_crc0, [x_buf, 1008]
+ ldr x_data_crc1, [x_buf, 1520]
+ ldr x_data_crc2, [x_buf, 2032]
+
+ ext v_a1.16b, v_a1.16b, v_neon_zero.16b, #8
+ eor v_a1.16b, v_a2.16b, v_a1.16b
+ ldr q_neon_tmp3, [x_tmp, #:lo12:.lanchor_mask]
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ dup d_a0, v_k5k0.d[1]
+ pmull v_a3.1q, v_a2.1d, v_a0.1d
+
+ ext v_a2.16b, v_a1.16b, v_neon_zero.16b, #4
+ and v_a1.16b, v_a1.16b, v_neon_tmp3.16b
+ pmull v_a1.1q, v_a1.1d, v_k5k0.1d
+ eor v_a1.16b, v_a2.16b, v_a1.16b
+
+// PMULL: Barrett reduction to 32 bits
+ ldr q_neon_tmp1, [x_pconst, offset_poly] // poly
+
+ ldr x_data_crc0, [x_buf, 1016]
+ ldr x_data_crc1, [x_buf, 1528]
+ ldr x_data_crc2, [x_buf, 2040]
+
+ dup d_neon_tmp2, v_neon_tmp1.d[1]
+
+ crc32_u64 w_crc0, w_crc0, x_data_crc0
+ crc32_u64 w_crc1, w_crc1, x_data_crc1
+ crc32_u64 w_crc2, w_crc2, x_data_crc2
+
+ and v_a2.16b, v_a1.16b, v_neon_tmp3.16b
+ pmull v_a2.1q, v_a2.1d, v_neon_tmp2.1d
+ and v_a2.16b, v_neon_tmp3.16b, v_a2.16b
+ pmull v_a2.1q, v_a2.1d, v_neon_tmp1.1d
+
+// crc_pmull result
+ eor v_a1.16b, v_a1.16b, v_a2.16b
+ dup s_crc_pmull, v_a1.s[1]
+
+// merge crc_pmull, crc0, crc1, crc2 using pmull instruction
+ fmov s_neon_crc0, w_crc0
+ fmov s_neon_crc1, w_crc1
+
+ ldr q_neon_const0, [x_pconst, offset_crc32_const]
+ ldr q_neon_const1, [x_pconst, offset_crc32_const+16]
+ ldr q_neon_const2, [x_pconst, offset_crc32_const+32]
+
+ pmull v_crc_pmull.1q, v_crc_pmull.1d, v_neon_const0.1d
+ pmull v_neon_crc0.1q, v_neon_crc0.1d, v_neon_const1.1d
+ pmull v_neon_crc1.1q, v_neon_crc1.1d, v_neon_const2.1d
+
+ fmov x_tmp1, d_neon_crc0
+ crc32_u64 w_crc0, wzr, x_tmp1
+
+ fmov x_tmp1, d_neon_crc1
+ crc32_u64 w_crc1, wzr, x_tmp1
+
+ eor w_ret_crc, w_crc1, w_crc0
+
+ fmov x_tmp1, d_crc_pmull
+ crc32_u64 w_tmp, wzr, x_tmp1
+
+ eor w_crc2, w_tmp, w_crc2
+
+// handle crc32/crc32c
+#ifdef CRC32
+ eon w_ret_crc, w_crc2, w_ret_crc
+#else
+ eor w_ret_crc, w_crc2, w_ret_crc
+#endif
+.endm
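
The merge at the end of crc32_mix2048 brings the four partial results (the PMULL lane plus the three crc32 lanes) back together: each one is shifted to its byte offset inside the 2048-byte block by a carry-less multiply with the matching crc32_const entry, reduced back to 32 bits with a crc32 instruction seeded from zero, and then the lanes are XORed (with the extra complement, eon, in the crc32 flavour). The shift-and-reduce step in C intrinsics (a sketch; the shift constants are only the table entries above):

    #include <arm_acle.h>
    #include <arm_neon.h>
    #include <stdint.h>

    /* Shift a 32-bit partial CRC to its position in the block: carry-less
     * multiply by a precomputed constant (an entry of crc32_const), then
     * reduce the product modulo the CRC polynomial via a zero-seeded crc32. */
    static inline uint32_t crc32_shift_by_const(uint32_t partial, poly64_t shift_const)
    {
            poly128_t prod = vmull_p64((poly64_t)partial, shift_const);
            uint64_t lo64  = vgetq_lane_u64(vreinterpretq_u64_p128(prod), 0);
            return __crc32d(0, lo64);
    }
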
+
+// crc32 mix main default
+.macro crc32_mix_main_default
+ cmp x_len, 2047
+ mov x_len_saved, x_len
+ mov x_buf_saved, x_buf
+ bls .less_than_2048
+
+ sub x_buf_iter, x_len, #2048
+ stp x29, x30, [sp, -16]!
+
+ mov x29, sp
+ and x_buf_iter, x_buf_iter, -2048
+ add x_buf_iter, x_buf_iter, 2048
+ add x_buf_iter, x_buf, x_buf_iter
+
+ .align 4
+.loop_mix:
+ mov x_buf, x_buf_saved
+ crc32_mix2048
+
+ add x_buf_saved, x_buf_saved, 2048
+ cmp x_buf_saved, x_buf_iter
+ bne .loop_mix
+
+ and x_len_saved, x_len_saved, 2047
+ cbnz x_len_saved, .remain_ldp
+
+ ldp x29, x30, [sp], 16
+ ret
+
+ .align 4
+.remain_ldp:
+ mov w_crc_tmp, crc
+ ldp x29, x30, [sp], 16
+ mov size, x_len_saved
+ mov buf, x_buf_iter
+ b .crc32_hw_handle
+
+.remain:
+ mov w_crc_tmp, crc
+ mov size, x_len_saved
+ mov buf, x_buf_saved
+ b .crc32_hw_handle
+
+ .align 4
+.less_than_2048:
+ cbnz x_len, .remain
+ ret
+
+.crc32_hw_handle:
+ cmp size, 63
+
+#ifdef CRC32
+ mvn crc_tmp, crc_tmp
+#endif
+
+ bls .less_than_64
+ sub buf_saved, size, #64
+ and buf_saved, buf_saved, -64
+ add buf_saved, buf_saved, 64
+ add buf_saved, buf, buf_saved
+
+ .align 4
+.loop_64:
+ ldp data_tmp1, data_tmp2, [buf]
+ ldr data_tmp3, [buf, 16]
+ crc32_u64 crc_tmp, crc_tmp, data_tmp1
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+
+ ldp data_tmp1, data_tmp2, [buf, 24]
+ add buf, buf, 64
+
+ crc32_u64 crc_tmp, crc_tmp, data_tmp3
+ ldr data_tmp3, [buf, -24]
+
+ crc32_u64 crc_tmp, crc_tmp, data_tmp1
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+
+ ldp data_tmp1, data_tmp2, [buf, -16]
+ cmp buf_saved, buf
+ crc32_u64 crc_tmp, crc_tmp, data_tmp3
+
+ crc32_u64 crc_tmp, crc_tmp, data_tmp1
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+ bne .loop_64
+
+ and size, size, 63
+.less_than_64:
+ cmp size, 7
+ bls .crc32_hw_w
+
+ ldr data_tmp2, [buf]
+ sub size_tmp, size, #8
+ cmp size_tmp, 7
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+ bls .crc32_hw_w_pre
+
+ ldr data_tmp2, [buf, 8]
+ sub data_tmp3, size, #16
+ cmp data_tmp3, 7
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+ bls .crc32_hw_w_pre
+
+ ldr data_tmp2, [buf, 16]
+ sub data_tmp3, size, #24
+ cmp data_tmp3, 7
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+ bls .crc32_hw_w_pre
+
+ ldr data_tmp2, [buf, 24]
+ sub data_tmp3, size, #32
+ cmp data_tmp3, 7
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+ bls .crc32_hw_w_pre
+
+ ldr data_tmp2, [buf, 32]
+ sub data_tmp3, size, #40
+ cmp data_tmp3, 7
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+ bls .crc32_hw_w_pre
+
+ ldr data_tmp2, [buf, 40]
+ sub data_tmp3, size, #48
+ cmp data_tmp3, 7
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+ bls .crc32_hw_w_pre
+
+ ldr data_tmp2, [buf, 48]
+ crc32_u64 crc_tmp, crc_tmp, data_tmp2
+
+.crc32_hw_w_pre:
+ and size_tmp, size_tmp, -8
+ and size, size, 7
+ add size_tmp, size_tmp, 8
+ add buf, buf, size_tmp
+
+.crc32_hw_w:
+ cmp size, 3
+ bls .crc32_hw_h
+ ldr w_data_tmp2, [buf], 4
+ sub size, size, #4
+ crc32_u32 crc_tmp, crc_tmp, w_data_tmp2
+
+.crc32_hw_h:
+ cmp size, 1
+ bls .crc32_hw_b
+ ldrh w_data_tmp2, [buf], 2
+ sub size, size, #2
+ crc32_u16 crc_tmp, crc_tmp, w_data_tmp2
+
+.crc32_hw_b:
+ cbz size, .crc32_hw_done
+ ldrb w_data_tmp2, [buf]
+ crc32_u8 crc_tmp, crc_tmp, w_data_tmp2
+
+.crc32_hw_done:
+#ifdef CRC32
+ mvn ret_crc, crc_tmp
+#else
+ mov ret_crc, crc_tmp
+#endif
+ ret
+.endm
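crc32_mix_main_default drives whole 2 KB blocks through crc32_mix2048 and hands the remaining bytes to the plain hardware-CRC tail at .crc32_hw_handle: unrolled 64-byte chunks, then 8-, 4-, 2- and 1-byte steps, with the seed inverted on entry and exit only for the CRC32 (gzip) variant. A minimal C equivalent of that tail path, written with the ACLE intrinsics for the CRC32 case and without the 64-byte unrolling:

	#include <arm_acle.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Hypothetical helper mirroring the .crc32_hw_handle tail path. */
	static uint32_t crc32_hw_tail(uint32_t crc, const uint8_t *buf, size_t len)
	{
		uint64_t d;
		uint32_t w;
		uint16_t h;

		crc = ~crc;                  /* #ifdef CRC32 pre-inversion */
		while (len >= 8) {           /* the asm unrolls this by 64 bytes */
			memcpy(&d, buf, 8);
			crc = __crc32d(crc, d);
			buf += 8;
			len -= 8;
		}
		if (len >= 4) {
			memcpy(&w, buf, 4);
			crc = __crc32w(crc, w);
			buf += 4;
			len -= 4;
		}
		if (len >= 2) {
			memcpy(&h, buf, 2);
			crc = __crc32h(crc, h);
			buf += 2;
			len -= 2;
		}
		if (len)
			crc = __crc32b(crc, *buf);
		return ~crc;                 /* #ifdef CRC32 post-inversion */
	}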
diff --git a/src/isa-l/crc/aarch64/crc32_mix_neoverse_n1.S b/src/isa-l/crc/aarch64/crc32_mix_neoverse_n1.S
new file mode 100644
index 000000000..62b40e1f2
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_mix_neoverse_n1.S
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .text
+ .align 6
+ .arch armv8-a+crypto+crc
+
+#include "crc32_common_mix_neoverse_n1.S"
+.Lconstants:
+ .octa 0x00000001c6e415960000000154442bd4
+ .octa 0x00000000ccaa009e00000001751997d0
+ .octa 0x00000001F701164100000001DB710641
+ .quad 0x0000000163cd6124
+ .quad 0x00000000FFFFFFFF
+ .quad 0x000000001753ab84
+.macro crc32_u64 dst,src,data
+ crc32x \dst,\src,\data
+.endm
+.macro crc32_u32 dst,src,data
+ crc32w \dst,\src,\data
+.endm
+.macro crc32_u16 dst,src,data
+ crc32h \dst,\src,\data
+.endm
+.macro crc32_u8 dst,src,data
+ crc32b \dst,\src,\data
+.endm
+
+
+/**
+ * uint32_t crc32_mix_neoverse_n1(uint32_t CRC, uint8_t * BUF,
+ *				size_t LEN)
+ */
+ BUF .req x1
+ LEN .req x2
+ CRC .req x0
+ wCRC .req w0
+ .align 6
+ .global crc32_mix_neoverse_n1
+ .type crc32_mix_neoverse_n1, %function
+crc32_mix_neoverse_n1:
+ crc32_common_mix crc32
+ .size crc32_mix_neoverse_n1, .-crc32_mix_neoverse_n1
+
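The .Lconstants block above holds the PMULL folding multipliers used by crc32_common_mix_neoverse_n1.S for the gzip CRC32 polynomial, each of the form x^k mod P(x) for a folding distance k, plus a 32-bit mask. The exact exponents belong to that file's folding schedule and are not claimed here; purely as an illustration, constants of this shape can be derived with a bit-serial modular multiply such as:

	#include <stdint.h>

	/* x^k mod P(x) over GF(2), with P given as the full 33-bit polynomial
	 * (e.g. 0x104C11DB7 for the gzip CRC32). Illustrative only. */
	static uint32_t xpow_mod(uint64_t k, uint64_t poly33)
	{
		uint64_t r = 1;                  /* x^0 */
		while (k--) {
			r <<= 1;                 /* multiply by x */
			if (r & (1ULL << 32))
				r ^= poly33;     /* reduce mod P */
		}
		return (uint32_t)r;
	}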
diff --git a/src/isa-l/crc/aarch64/crc32_norm_common_pmull.h b/src/isa-l/crc/aarch64/crc32_norm_common_pmull.h
new file mode 100644
index 000000000..7377e30a1
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_norm_common_pmull.h
@@ -0,0 +1,135 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc_common_pmull.h"
+
+.macro crc32_norm_func name:req
+ .arch armv8-a+crypto
+ .text
+ .align 3
+ .global \name
+ .type \name, %function
+
+/* uint32_t crc32_norm_func(uint32_t seed, uint8_t * buf, uint64_t len) */
+
+\name\():
+ mvn w_seed, w_seed
+ mov x_counter, 0
+ cmp x_len, (FOLD_SIZE - 1)
+ bhi .crc_clmul_pre
+
+.crc_tab_pre:
+ cmp x_len, x_counter
+ bls .done
+
+ adrp x_tmp, .lanchor_crc_tab
+ add x_buf_iter, x_buf, x_counter
+ add x_buf, x_buf, x_len
+ add x_crc_tab_addr, x_tmp, :lo12:.lanchor_crc_tab
+
+ .align 3
+.loop_crc_tab:
+ ldrb w_tmp, [x_buf_iter], 1
+ cmp x_buf, x_buf_iter
+ eor w_tmp, w_tmp, w_seed, lsr 24
+ ldr w_tmp, [x_crc_tab_addr, w_tmp, uxtw 2]
+ eor w_seed, w_tmp, w_seed, lsl 8
+ bhi .loop_crc_tab
+
+.done:
+ mvn w_crc_ret, w_seed
+ ret
+
+ .align 2
+.crc_clmul_pre:
+ lsl x_seed, x_seed, 32
+ movi v_x0.2s, 0
+ fmov v_x0.d[1], x_seed // save crc to v_x0
+
+ crc_norm_load_first_block
+
+ bls .clmul_loop_end
+
+ crc32_load_p4
+
+// 1024bit --> 512bit loop
+// merge x0, x1, x2, x3, y0, y1, y2, y3 => x0, x1, x2, x3 (uint64x2_t)
+ crc_norm_loop
+
+.clmul_loop_end:
+// folding 512bit --> 128bit
+ crc32_fold_512b_to_128b
+
+// folding 128bit --> 64bit
+ mov x_tmp, p0_high_b0
+ movk x_tmp, p0_high_b1, lsl 16
+ fmov d_p0_high, x_tmp
+
+ mov x_tmp2, p0_low_b0
+ movk x_tmp2, p0_low_b1, lsl 16
+ fmov d_p0_high2, x_tmp2
+
+ mov d_tmp_high, v_x3.d[0]
+ ext v_tmp_high.16b, v_tmp_high.16b, v_tmp_high.16b, #12
+
+ pmull2 v_x3.1q, v_x3.2d, v_p0.2d
+
+ eor v_tmp_high.16b, v_tmp_high.16b, v_x3.16b
+ pmull2 v_x3.1q, v_tmp_high.2d, v_p02.2d
+
+// barrett reduction
+ mov x_tmp2, br_high_b0
+ movk x_tmp2, br_high_b1, lsl 16
+ movk x_tmp2, br_high_b2, lsl 32
+ fmov d_br_high, x_tmp2
+
+ mov x_tmp, br_low_b0
+ movk x_tmp, br_low_b1, lsl 16
+ movk x_tmp, br_low_b2, lsl 32
+ fmov d_br_low, x_tmp
+
+ eor v_tmp_high.16b, v_tmp_high.16b, v_x3.16b
+ mov s_x3, v_tmp_high.s[1]
+ pmull v_x3.1q, v_x3.1d, v_br_low.1d
+
+ mov s_x3, v_x3.s[1]
+ pmull v_x3.1q, v_x3.1d, v_br_high.1d
+ eor v_tmp_high.8b, v_tmp_high.8b, v_x3.8b
+ umov w_seed, v_tmp_high.s[0]
+
+ b .crc_tab_pre
+
+ .size \name, .-\name
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+.shuffle_data:
+ .byte 15, 14, 13, 12, 11, 10, 9
+ .byte 8, 7, 6, 5, 4, 3, 2, 1, 0
+.endm
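crc32_norm_func only enters the PMULL path once the buffer reaches FOLD_SIZE bytes; anything shorter, and the bytes left over after folding, go through the byte-at-a-time table loop at .loop_crc_tab. In C that loop is the classic MSB-first table update, with the seed inverted at entry and exit; this sketch assumes crc32_table is the 256-entry .lanchor_crc_tab table defined in the polynomial-specific header:

	#include <stddef.h>
	#include <stdint.h>

	/* Sketch of the .loop_crc_tab fallback for the normal (MSB-first) form. */
	static uint32_t crc32_norm_tab(uint32_t seed, const uint8_t *buf, size_t len,
				       const uint32_t crc32_table[256])
	{
		uint32_t crc = ~seed;
		while (len--)
			crc = (crc << 8) ^ crc32_table[((crc >> 24) ^ *buf++) & 0xff];
		return ~crc;
	}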
diff --git a/src/isa-l/crc/aarch64/crc32_refl_common_pmull.h b/src/isa-l/crc/aarch64/crc32_refl_common_pmull.h
new file mode 100644
index 000000000..6418f1240
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32_refl_common_pmull.h
@@ -0,0 +1,126 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc_common_pmull.h"
+
+.macro crc32_refl_func name:req
+ .arch armv8-a+crypto
+ .text
+ .align 3
+ .global \name
+ .type \name, %function
+
+/* uint32_t crc32_refl_func(uint32_t seed, uint8_t * buf, uint64_t len) */
+
+\name\():
+ mvn w_seed, w_seed
+ mov x_counter, 0
+ cmp x_len, (FOLD_SIZE - 1)
+ bhi .crc32_clmul_pre
+
+.crc_tab_pre:
+ cmp x_len, x_counter
+ bls .done
+
+ adrp x_tmp, .lanchor_crc_tab
+ add x_buf_iter, x_buf, x_counter
+ add x_buf, x_buf, x_len
+ add x_crc_tab_addr, x_tmp, :lo12:.lanchor_crc_tab
+
+ .align 3
+.loop_crc_tab:
+ ldrb w_tmp, [x_buf_iter], 1
+ cmp x_buf, x_buf_iter
+ eor w_tmp, w_tmp, w_seed
+ and w_tmp, w_tmp, 255
+ ldr w_tmp, [x_crc_tab_addr, w_tmp, uxtw 2]
+ eor w_seed, w_tmp, w_seed, lsr 8
+ bhi .loop_crc_tab
+
+.done:
+ mvn w_crc_ret, w_seed
+ ret
+
+ .align 2
+.crc32_clmul_pre:
+ fmov s_x0, w_seed // save crc to s_x0
+
+ crc_refl_load_first_block
+
+ bls .clmul_loop_end
+
+ crc32_load_p4
+
+// 1024bit --> 512bit loop
+// merge x0, x1, x2, x3, y0, y1, y2, y3 => x0, x1, x2, x3 (uint64x2_t)
+ crc_refl_loop
+
+.clmul_loop_end:
+// folding 512bit --> 128bit
+ crc32_fold_512b_to_128b
+
+// folding 128bit --> 64bit
+ mov x_tmp, p0_low_b0
+ movk x_tmp, p0_low_b1, lsl 16
+ fmov d_p0_low2, x_tmp
+
+ mov d_tmp_high, v_x3.d[1]
+
+ mov d_p0_low, v_p1.d[1]
+ pmull v_x3.1q, v_x3.1d, v_p0.1d
+
+ eor v_tmp_high.16b, v_tmp_high.16b, v_x3.16b
+ mov s_x3, v_tmp_high.s[0]
+ ext v_tmp_high.16b, v_tmp_high.16b, v_tmp_high.16b, #4
+ pmull v_x3.1q, v_x3.1d, v_p02.1d
+
+// barrett reduction
+ mov x_tmp2, br_high_b0
+ movk x_tmp2, br_high_b1, lsl 16
+ movk x_tmp2, br_high_b2, lsl 32
+ fmov d_br_high, x_tmp2
+
+ mov x_tmp, br_low_b0
+ movk x_tmp, br_low_b1, lsl 16
+ movk x_tmp, br_low_b2, lsl 32
+ fmov d_br_low, x_tmp
+
+ eor v_tmp_high.16b, v_tmp_high.16b, v_x3.16b
+ mov s_x3, v_tmp_high.s[0]
+ pmull v_x3.1q, v_x3.1d, v_br_high.1d
+
+ mov s_x3, v_x3.s[0]
+ pmull v_x3.1q, v_x3.1d, v_br_low.1d
+ eor v_tmp_high.8b, v_tmp_high.8b, v_x3.8b
+ umov w_seed, v_tmp_high.s[1]
+
+ b .crc_tab_pre
+
+ .size \name, .-\name
+.endm
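The reflected variant mirrors the same structure; its .loop_crc_tab fallback is the LSB-first update, again bracketed by the ~seed convention:

	#include <stddef.h>
	#include <stdint.h>

	/* Sketch of the .loop_crc_tab fallback for the reflected (LSB-first) form. */
	static uint32_t crc32_refl_tab(uint32_t seed, const uint8_t *buf, size_t len,
				       const uint32_t crc32_table[256])
	{
		uint32_t crc = ~seed;
		while (len--)
			crc = (crc >> 8) ^ crc32_table[(crc ^ *buf++) & 0xff];
		return ~crc;
	}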
diff --git a/src/isa-l/crc/aarch64/crc32c_mix_default.S b/src/isa-l/crc/aarch64/crc32c_mix_default.S
new file mode 100644
index 000000000..87b8ce39c
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32c_mix_default.S
@@ -0,0 +1,109 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .text
+ .arch armv8-a+crypto+crc
+ .align 6
+
+.macro crc32_u64 dst,src,data
+ crc32cx \dst,\src,\data
+.endm
+
+.macro crc32_u32 dst,src,data
+ crc32cw \dst,\src,\data
+.endm
+
+.macro crc32_u16 dst,src,data
+ crc32ch \dst,\src,\data
+.endm
+
+.macro crc32_u8 dst,src,data
+ crc32cb \dst,\src,\data
+.endm
+
+#include "crc32_mix_default_common.S"
+
+ .global crc32c_mix_default
+ .type crc32c_mix_default, %function
+crc32c_mix_default:
+ mov w3, w2
+ sxtw x2, w1
+ mov x1, x0
+ mov w0, w3
+ crc32_mix_main_default
+ .size crc32c_mix_default, .-crc32c_mix_default
+
+ .section .rodata
+ .align 4
+ .set lanchor_crc32,. + 0
+
+ .type k1k2, %object
+ .size k1k2, 16
+k1k2:
+ .xword 0x00740eef02
+ .xword 0x009e4addf8
+
+ .type k3k4, %object
+ .size k3k4, 16
+k3k4:
+ .xword 0x00f20c0dfe
+ .xword 0x014cd00bd6
+
+ .type k5k0, %object
+ .size k5k0, 16
+k5k0:
+ .xword 0x00dd45aab8
+ .xword 0
+
+ .type poly, %object
+ .size poly, 16
+poly:
+ .xword 0x0105ec76f0
+ .xword 0x00dea713f1
+
+ .type crc32_const, %object
+ .size crc32_const, 48
+crc32_const:
+ .xword 0x9ef68d35
+ .xword 0
+ .xword 0x170076fa
+ .xword 0
+ .xword 0xdd7e3b0c
+ .xword 0
+
+ .align 4
+ .set .lanchor_mask,. + 0
+
+ .type mask, %object
+ .size mask, 16
+mask:
+ .word -1
+ .word 0
+ .word -1
+ .word 0
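The exported crc32c_mix_default entry point above is essentially an argument shuffle: it appears to take (buf, len, crc) in the crc32_iscsi() argument order, while the shared crc32_mix_main_default macro expects the CRC in w0, the buffer in x1 and the length in x2, so the prologue moves each register into place and sign-extends the 32-bit length. In C terms, roughly (the internal worker name is hypothetical):

	#include <stddef.h>
	#include <stdint.h>

	/* Assumed internal worker implemented by crc32_mix_main_default. */
	uint32_t crc32c_mix_main(uint32_t crc, const uint8_t *buf, size_t len);

	/* Exported wrapper: reorder (buf, len, crc) into (crc, buf, len). */
	uint32_t crc32c_mix_default(const uint8_t *buf, int len, uint32_t crc)
	{
		return crc32c_mix_main(crc, buf, (size_t)len);
	}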
diff --git a/src/isa-l/crc/aarch64/crc32c_mix_neoverse_n1.S b/src/isa-l/crc/aarch64/crc32c_mix_neoverse_n1.S
new file mode 100644
index 000000000..a98511aab
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc32c_mix_neoverse_n1.S
@@ -0,0 +1,68 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .text
+ .align 6
+ .arch armv8-a+crypto+crc
+
+#include "crc32_common_mix_neoverse_n1.S"
+.Lconstants:
+ .octa 0x000000009e4addf800000000740eef02
+ .octa 0x000000014cd00bd600000000f20c0dfe
+ .octa 0x00000000dea713f10000000105ec76f0
+ .quad 0x00000000dd45aab8
+ .quad 0x00000000FFFFFFFF
+ .quad 0x000000009ef68d35
+
+.macro crc32_u64 dst,src,data
+ crc32cx \dst,\src,\data
+.endm
+.macro crc32_u32 dst,src,data
+ crc32cw \dst,\src,\data
+.endm
+.macro crc32_u16 dst,src,data
+ crc32ch \dst,\src,\data
+.endm
+.macro crc32_u8 dst,src,data
+ crc32cb \dst,\src,\data
+.endm
+/**
+ * uint32_t crc32c_mix_neoverse_n1(uint8_t * BUF,
+ *			size_t LEN, uint32_t CRC)
+ */
+ BUF .req x0
+ LEN .req x1
+ CRC .req x2
+ wCRC .req w2
+ .align 6
+ .global crc32c_mix_neoverse_n1
+ .type crc32c_mix_neoverse_n1, %function
+crc32c_mix_neoverse_n1:
+ crc32_common_mix crc32c
+ .size crc32c_mix_neoverse_n1, .-crc32c_mix_neoverse_n1
diff --git a/src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.S b/src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.S
new file mode 100644
index 000000000..0089a09de
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc64_ecma_norm_pmull.h"
+#include "crc64_norm_common_pmull.h"
+
+crc64_norm_func crc64_ecma_norm_pmull
diff --git a/src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.h b/src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.h
new file mode 100644
index 000000000..07d58cd87
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_ecma_norm_pmull.h
@@ -0,0 +1,200 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, (0xf020)
+.equ p4_low_b1, 0x540d
+.equ p4_low_b2, 0x43ca
+.equ p4_low_b3, 0x5f68
+.equ p4_high_b0, 0xb83f
+.equ p4_high_b1, 0x1205
+.equ p4_high_b2, 0xb698
+.equ p4_high_b3, 0xddf4
+
+.equ p1_low_b0, (0xfab6)
+.equ p1_low_b1, 0xeb52
+.equ p1_low_b2, 0xc3c7
+.equ p1_low_b3, 0x05f5
+.equ p1_high_b0, 0x740e
+.equ p1_high_b1, 0xd257
+.equ p1_high_b2, 0x38a7
+.equ p1_high_b3, 0x4eb9
+
+.equ p0_low_b0, (0xfab6)
+.equ p0_low_b1, 0xeb52
+.equ p0_low_b2, 0xc3c7
+.equ p0_low_b3, 0x05f5
+.equ p0_high_b0, 0x0
+.equ p0_high_b1, 0x0
+.equ p0_high_b2, 0x0
+.equ p0_high_b3, 0x0
+
+.equ br_low_b0, (0xf872)
+.equ br_low_b1, 0x6cc4
+.equ br_low_b2, 0x29d0
+.equ br_low_b3, 0x578d
+.equ br_high_b0, 0x3693
+.equ br_high_b1, 0xa9ea
+.equ br_high_b2, 0xe1eb
+.equ br_high_b3, 0x42f0
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc64_tab, %object
+ .size crc64_tab, 2048
+crc64_tab:
+ .xword 0x0000000000000000, 0x42f0e1eba9ea3693
+ .xword 0x85e1c3d753d46d26, 0xc711223cfa3e5bb5
+ .xword 0x493366450e42ecdf, 0x0bc387aea7a8da4c
+ .xword 0xccd2a5925d9681f9, 0x8e224479f47cb76a
+ .xword 0x9266cc8a1c85d9be, 0xd0962d61b56fef2d
+ .xword 0x17870f5d4f51b498, 0x5577eeb6e6bb820b
+ .xword 0xdb55aacf12c73561, 0x99a54b24bb2d03f2
+ .xword 0x5eb4691841135847, 0x1c4488f3e8f96ed4
+ .xword 0x663d78ff90e185ef, 0x24cd9914390bb37c
+ .xword 0xe3dcbb28c335e8c9, 0xa12c5ac36adfde5a
+ .xword 0x2f0e1eba9ea36930, 0x6dfeff5137495fa3
+ .xword 0xaaefdd6dcd770416, 0xe81f3c86649d3285
+ .xword 0xf45bb4758c645c51, 0xb6ab559e258e6ac2
+ .xword 0x71ba77a2dfb03177, 0x334a9649765a07e4
+ .xword 0xbd68d2308226b08e, 0xff9833db2bcc861d
+ .xword 0x388911e7d1f2dda8, 0x7a79f00c7818eb3b
+ .xword 0xcc7af1ff21c30bde, 0x8e8a101488293d4d
+ .xword 0x499b3228721766f8, 0x0b6bd3c3dbfd506b
+ .xword 0x854997ba2f81e701, 0xc7b97651866bd192
+ .xword 0x00a8546d7c558a27, 0x4258b586d5bfbcb4
+ .xword 0x5e1c3d753d46d260, 0x1cecdc9e94ace4f3
+ .xword 0xdbfdfea26e92bf46, 0x990d1f49c77889d5
+ .xword 0x172f5b3033043ebf, 0x55dfbadb9aee082c
+ .xword 0x92ce98e760d05399, 0xd03e790cc93a650a
+ .xword 0xaa478900b1228e31, 0xe8b768eb18c8b8a2
+ .xword 0x2fa64ad7e2f6e317, 0x6d56ab3c4b1cd584
+ .xword 0xe374ef45bf6062ee, 0xa1840eae168a547d
+ .xword 0x66952c92ecb40fc8, 0x2465cd79455e395b
+ .xword 0x3821458aada7578f, 0x7ad1a461044d611c
+ .xword 0xbdc0865dfe733aa9, 0xff3067b657990c3a
+ .xword 0x711223cfa3e5bb50, 0x33e2c2240a0f8dc3
+ .xword 0xf4f3e018f031d676, 0xb60301f359dbe0e5
+ .xword 0xda050215ea6c212f, 0x98f5e3fe438617bc
+ .xword 0x5fe4c1c2b9b84c09, 0x1d14202910527a9a
+ .xword 0x93366450e42ecdf0, 0xd1c685bb4dc4fb63
+ .xword 0x16d7a787b7faa0d6, 0x5427466c1e109645
+ .xword 0x4863ce9ff6e9f891, 0x0a932f745f03ce02
+ .xword 0xcd820d48a53d95b7, 0x8f72eca30cd7a324
+ .xword 0x0150a8daf8ab144e, 0x43a04931514122dd
+ .xword 0x84b16b0dab7f7968, 0xc6418ae602954ffb
+ .xword 0xbc387aea7a8da4c0, 0xfec89b01d3679253
+ .xword 0x39d9b93d2959c9e6, 0x7b2958d680b3ff75
+ .xword 0xf50b1caf74cf481f, 0xb7fbfd44dd257e8c
+ .xword 0x70eadf78271b2539, 0x321a3e938ef113aa
+ .xword 0x2e5eb66066087d7e, 0x6cae578bcfe24bed
+ .xword 0xabbf75b735dc1058, 0xe94f945c9c3626cb
+ .xword 0x676dd025684a91a1, 0x259d31cec1a0a732
+ .xword 0xe28c13f23b9efc87, 0xa07cf2199274ca14
+ .xword 0x167ff3eacbaf2af1, 0x548f120162451c62
+ .xword 0x939e303d987b47d7, 0xd16ed1d631917144
+ .xword 0x5f4c95afc5edc62e, 0x1dbc74446c07f0bd
+ .xword 0xdaad56789639ab08, 0x985db7933fd39d9b
+ .xword 0x84193f60d72af34f, 0xc6e9de8b7ec0c5dc
+ .xword 0x01f8fcb784fe9e69, 0x43081d5c2d14a8fa
+ .xword 0xcd2a5925d9681f90, 0x8fdab8ce70822903
+ .xword 0x48cb9af28abc72b6, 0x0a3b7b1923564425
+ .xword 0x70428b155b4eaf1e, 0x32b26afef2a4998d
+ .xword 0xf5a348c2089ac238, 0xb753a929a170f4ab
+ .xword 0x3971ed50550c43c1, 0x7b810cbbfce67552
+ .xword 0xbc902e8706d82ee7, 0xfe60cf6caf321874
+ .xword 0xe224479f47cb76a0, 0xa0d4a674ee214033
+ .xword 0x67c58448141f1b86, 0x253565a3bdf52d15
+ .xword 0xab1721da49899a7f, 0xe9e7c031e063acec
+ .xword 0x2ef6e20d1a5df759, 0x6c0603e6b3b7c1ca
+ .xword 0xf6fae5c07d3274cd, 0xb40a042bd4d8425e
+ .xword 0x731b26172ee619eb, 0x31ebc7fc870c2f78
+ .xword 0xbfc9838573709812, 0xfd39626eda9aae81
+ .xword 0x3a28405220a4f534, 0x78d8a1b9894ec3a7
+ .xword 0x649c294a61b7ad73, 0x266cc8a1c85d9be0
+ .xword 0xe17dea9d3263c055, 0xa38d0b769b89f6c6
+ .xword 0x2daf4f0f6ff541ac, 0x6f5faee4c61f773f
+ .xword 0xa84e8cd83c212c8a, 0xeabe6d3395cb1a19
+ .xword 0x90c79d3fedd3f122, 0xd2377cd44439c7b1
+ .xword 0x15265ee8be079c04, 0x57d6bf0317edaa97
+ .xword 0xd9f4fb7ae3911dfd, 0x9b041a914a7b2b6e
+ .xword 0x5c1538adb04570db, 0x1ee5d94619af4648
+ .xword 0x02a151b5f156289c, 0x4051b05e58bc1e0f
+ .xword 0x87409262a28245ba, 0xc5b073890b687329
+ .xword 0x4b9237f0ff14c443, 0x0962d61b56fef2d0
+ .xword 0xce73f427acc0a965, 0x8c8315cc052a9ff6
+ .xword 0x3a80143f5cf17f13, 0x7870f5d4f51b4980
+ .xword 0xbf61d7e80f251235, 0xfd913603a6cf24a6
+ .xword 0x73b3727a52b393cc, 0x31439391fb59a55f
+ .xword 0xf652b1ad0167feea, 0xb4a25046a88dc879
+ .xword 0xa8e6d8b54074a6ad, 0xea16395ee99e903e
+ .xword 0x2d071b6213a0cb8b, 0x6ff7fa89ba4afd18
+ .xword 0xe1d5bef04e364a72, 0xa3255f1be7dc7ce1
+ .xword 0x64347d271de22754, 0x26c49cccb40811c7
+ .xword 0x5cbd6cc0cc10fafc, 0x1e4d8d2b65facc6f
+ .xword 0xd95caf179fc497da, 0x9bac4efc362ea149
+ .xword 0x158e0a85c2521623, 0x577eeb6e6bb820b0
+ .xword 0x906fc95291867b05, 0xd29f28b9386c4d96
+ .xword 0xcedba04ad0952342, 0x8c2b41a1797f15d1
+ .xword 0x4b3a639d83414e64, 0x09ca82762aab78f7
+ .xword 0x87e8c60fded7cf9d, 0xc51827e4773df90e
+ .xword 0x020905d88d03a2bb, 0x40f9e43324e99428
+ .xword 0x2cffe7d5975e55e2, 0x6e0f063e3eb46371
+ .xword 0xa91e2402c48a38c4, 0xebeec5e96d600e57
+ .xword 0x65cc8190991cb93d, 0x273c607b30f68fae
+ .xword 0xe02d4247cac8d41b, 0xa2dda3ac6322e288
+ .xword 0xbe992b5f8bdb8c5c, 0xfc69cab42231bacf
+ .xword 0x3b78e888d80fe17a, 0x7988096371e5d7e9
+ .xword 0xf7aa4d1a85996083, 0xb55aacf12c735610
+ .xword 0x724b8ecdd64d0da5, 0x30bb6f267fa73b36
+ .xword 0x4ac29f2a07bfd00d, 0x08327ec1ae55e69e
+ .xword 0xcf235cfd546bbd2b, 0x8dd3bd16fd818bb8
+ .xword 0x03f1f96f09fd3cd2, 0x41011884a0170a41
+ .xword 0x86103ab85a2951f4, 0xc4e0db53f3c36767
+ .xword 0xd8a453a01b3a09b3, 0x9a54b24bb2d03f20
+ .xword 0x5d45907748ee6495, 0x1fb5719ce1045206
+ .xword 0x919735e51578e56c, 0xd367d40ebc92d3ff
+ .xword 0x1476f63246ac884a, 0x568617d9ef46bed9
+ .xword 0xe085162ab69d5e3c, 0xa275f7c11f7768af
+ .xword 0x6564d5fde549331a, 0x279434164ca30589
+ .xword 0xa9b6706fb8dfb2e3, 0xeb46918411358470
+ .xword 0x2c57b3b8eb0bdfc5, 0x6ea7525342e1e956
+ .xword 0x72e3daa0aa188782, 0x30133b4b03f2b111
+ .xword 0xf7021977f9cceaa4, 0xb5f2f89c5026dc37
+ .xword 0x3bd0bce5a45a6b5d, 0x79205d0e0db05dce
+ .xword 0xbe317f32f78e067b, 0xfcc19ed95e6430e8
+ .xword 0x86b86ed5267cdbd3, 0xc4488f3e8f96ed40
+ .xword 0x0359ad0275a8b6f5, 0x41a94ce9dc428066
+ .xword 0xcf8b0890283e370c, 0x8d7be97b81d4019f
+ .xword 0x4a6acb477bea5a2a, 0x089a2aacd2006cb9
+ .xword 0x14dea25f3af9026d, 0x562e43b4931334fe
+ .xword 0x913f6188692d6f4b, 0xd3cf8063c0c759d8
+ .xword 0x5dedc41a34bbeeb2, 0x1f1d25f19d51d821
+ .xword 0xd80c07cd676f8394, 0x9afce626ce85b507
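The crc64_tab above is the straight MSB-first table for the ECMA-182 polynomial 0x42F0E1EBA9EA3693 (note that tab[1] is the polynomial itself). As a sanity check, it can be regenerated with the usual construction:

	#include <stdint.h>

	/* Regenerate the MSB-first CRC64 table for a given polynomial,
	 * e.g. 0x42F0E1EBA9EA3693 for crc64_ecma_norm. */
	static void crc64_norm_gen_table(uint64_t poly, uint64_t tab[256])
	{
		for (int i = 0; i < 256; i++) {
			uint64_t crc = (uint64_t)i << 56;
			for (int bit = 0; bit < 8; bit++)
				crc = (crc & 0x8000000000000000ULL)
					  ? (crc << 1) ^ poly
					  : (crc << 1);
			tab[i] = crc;
		}
	}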
diff --git a/src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.S b/src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.S
new file mode 100644
index 000000000..812517f77
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc64_ecma_refl_pmull.h"
+#include "crc64_refl_common_pmull.h"
+
+crc64_refl_func crc64_ecma_refl_pmull
diff --git a/src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.h b/src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.h
new file mode 100644
index 000000000..5f53d7903
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_ecma_refl_pmull.h
@@ -0,0 +1,196 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, 0x41f3
+.equ p4_low_b1, 0x9dd4
+.equ p4_low_b2, 0xefbb
+.equ p4_low_b3, 0x6ae3
+.equ p4_high_b0, 0x2df4
+.equ p4_high_b1, 0xa784
+.equ p4_high_b2, 0x6054
+.equ p4_high_b3, 0x081f
+
+.equ p1_low_b0, 0x3ae4
+.equ p1_low_b1, 0xca39
+.equ p1_low_b2, 0xd497
+.equ p1_low_b3, 0xe05d
+.equ p1_high_b0, 0x5f40
+.equ p1_high_b1, 0xc787
+.equ p1_high_b2, 0x95af
+.equ p1_high_b3, 0xdabe
+
+.equ p0_low_b0, 0x5f40
+.equ p0_low_b1, 0xc787
+.equ p0_low_b2, 0x95af
+.equ p0_low_b3, 0xdabe
+
+.equ br_low_b0, 0x63d5
+.equ br_low_b1, 0x1729
+.equ br_low_b2, 0x466c
+.equ br_low_b3, 0x9c3e
+.equ br_high_b0, 0x1e85
+.equ br_high_b1, 0xaf0e
+.equ br_high_b2, 0xaf2b
+.equ br_high_b3, 0x92d8
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc64_tab, %object
+ .size crc64_tab, 2048
+crc64_tab:
+ .xword 0x0000000000000000, 0xb32e4cbe03a75f6f
+ .xword 0xf4843657a840a05b, 0x47aa7ae9abe7ff34
+ .xword 0x7bd0c384ff8f5e33, 0xc8fe8f3afc28015c
+ .xword 0x8f54f5d357cffe68, 0x3c7ab96d5468a107
+ .xword 0xf7a18709ff1ebc66, 0x448fcbb7fcb9e309
+ .xword 0x0325b15e575e1c3d, 0xb00bfde054f94352
+ .xword 0x8c71448d0091e255, 0x3f5f08330336bd3a
+ .xword 0x78f572daa8d1420e, 0xcbdb3e64ab761d61
+ .xword 0x7d9ba13851336649, 0xceb5ed8652943926
+ .xword 0x891f976ff973c612, 0x3a31dbd1fad4997d
+ .xword 0x064b62bcaebc387a, 0xb5652e02ad1b6715
+ .xword 0xf2cf54eb06fc9821, 0x41e11855055bc74e
+ .xword 0x8a3a2631ae2dda2f, 0x39146a8fad8a8540
+ .xword 0x7ebe1066066d7a74, 0xcd905cd805ca251b
+ .xword 0xf1eae5b551a2841c, 0x42c4a90b5205db73
+ .xword 0x056ed3e2f9e22447, 0xb6409f5cfa457b28
+ .xword 0xfb374270a266cc92, 0x48190ecea1c193fd
+ .xword 0x0fb374270a266cc9, 0xbc9d3899098133a6
+ .xword 0x80e781f45de992a1, 0x33c9cd4a5e4ecdce
+ .xword 0x7463b7a3f5a932fa, 0xc74dfb1df60e6d95
+ .xword 0x0c96c5795d7870f4, 0xbfb889c75edf2f9b
+ .xword 0xf812f32ef538d0af, 0x4b3cbf90f69f8fc0
+ .xword 0x774606fda2f72ec7, 0xc4684a43a15071a8
+ .xword 0x83c230aa0ab78e9c, 0x30ec7c140910d1f3
+ .xword 0x86ace348f355aadb, 0x3582aff6f0f2f5b4
+ .xword 0x7228d51f5b150a80, 0xc10699a158b255ef
+ .xword 0xfd7c20cc0cdaf4e8, 0x4e526c720f7dab87
+ .xword 0x09f8169ba49a54b3, 0xbad65a25a73d0bdc
+ .xword 0x710d64410c4b16bd, 0xc22328ff0fec49d2
+ .xword 0x85895216a40bb6e6, 0x36a71ea8a7ace989
+ .xword 0x0adda7c5f3c4488e, 0xb9f3eb7bf06317e1
+ .xword 0xfe5991925b84e8d5, 0x4d77dd2c5823b7ba
+ .xword 0x64b62bcaebc387a1, 0xd7986774e864d8ce
+ .xword 0x90321d9d438327fa, 0x231c512340247895
+ .xword 0x1f66e84e144cd992, 0xac48a4f017eb86fd
+ .xword 0xebe2de19bc0c79c9, 0x58cc92a7bfab26a6
+ .xword 0x9317acc314dd3bc7, 0x2039e07d177a64a8
+ .xword 0x67939a94bc9d9b9c, 0xd4bdd62abf3ac4f3
+ .xword 0xe8c76f47eb5265f4, 0x5be923f9e8f53a9b
+ .xword 0x1c4359104312c5af, 0xaf6d15ae40b59ac0
+ .xword 0x192d8af2baf0e1e8, 0xaa03c64cb957be87
+ .xword 0xeda9bca512b041b3, 0x5e87f01b11171edc
+ .xword 0x62fd4976457fbfdb, 0xd1d305c846d8e0b4
+ .xword 0x96797f21ed3f1f80, 0x2557339fee9840ef
+ .xword 0xee8c0dfb45ee5d8e, 0x5da24145464902e1
+ .xword 0x1a083bacedaefdd5, 0xa9267712ee09a2ba
+ .xword 0x955cce7fba6103bd, 0x267282c1b9c65cd2
+ .xword 0x61d8f8281221a3e6, 0xd2f6b4961186fc89
+ .xword 0x9f8169ba49a54b33, 0x2caf25044a02145c
+ .xword 0x6b055fede1e5eb68, 0xd82b1353e242b407
+ .xword 0xe451aa3eb62a1500, 0x577fe680b58d4a6f
+ .xword 0x10d59c691e6ab55b, 0xa3fbd0d71dcdea34
+ .xword 0x6820eeb3b6bbf755, 0xdb0ea20db51ca83a
+ .xword 0x9ca4d8e41efb570e, 0x2f8a945a1d5c0861
+ .xword 0x13f02d374934a966, 0xa0de61894a93f609
+ .xword 0xe7741b60e174093d, 0x545a57dee2d35652
+ .xword 0xe21ac88218962d7a, 0x5134843c1b317215
+ .xword 0x169efed5b0d68d21, 0xa5b0b26bb371d24e
+ .xword 0x99ca0b06e7197349, 0x2ae447b8e4be2c26
+ .xword 0x6d4e3d514f59d312, 0xde6071ef4cfe8c7d
+ .xword 0x15bb4f8be788911c, 0xa6950335e42fce73
+ .xword 0xe13f79dc4fc83147, 0x521135624c6f6e28
+ .xword 0x6e6b8c0f1807cf2f, 0xdd45c0b11ba09040
+ .xword 0x9aefba58b0476f74, 0x29c1f6e6b3e0301b
+ .xword 0xc96c5795d7870f42, 0x7a421b2bd420502d
+ .xword 0x3de861c27fc7af19, 0x8ec62d7c7c60f076
+ .xword 0xb2bc941128085171, 0x0192d8af2baf0e1e
+ .xword 0x4638a2468048f12a, 0xf516eef883efae45
+ .xword 0x3ecdd09c2899b324, 0x8de39c222b3eec4b
+ .xword 0xca49e6cb80d9137f, 0x7967aa75837e4c10
+ .xword 0x451d1318d716ed17, 0xf6335fa6d4b1b278
+ .xword 0xb199254f7f564d4c, 0x02b769f17cf11223
+ .xword 0xb4f7f6ad86b4690b, 0x07d9ba1385133664
+ .xword 0x4073c0fa2ef4c950, 0xf35d8c442d53963f
+ .xword 0xcf273529793b3738, 0x7c0979977a9c6857
+ .xword 0x3ba3037ed17b9763, 0x888d4fc0d2dcc80c
+ .xword 0x435671a479aad56d, 0xf0783d1a7a0d8a02
+ .xword 0xb7d247f3d1ea7536, 0x04fc0b4dd24d2a59
+ .xword 0x3886b22086258b5e, 0x8ba8fe9e8582d431
+ .xword 0xcc0284772e652b05, 0x7f2cc8c92dc2746a
+ .xword 0x325b15e575e1c3d0, 0x8175595b76469cbf
+ .xword 0xc6df23b2dda1638b, 0x75f16f0cde063ce4
+ .xword 0x498bd6618a6e9de3, 0xfaa59adf89c9c28c
+ .xword 0xbd0fe036222e3db8, 0x0e21ac88218962d7
+ .xword 0xc5fa92ec8aff7fb6, 0x76d4de52895820d9
+ .xword 0x317ea4bb22bfdfed, 0x8250e80521188082
+ .xword 0xbe2a516875702185, 0x0d041dd676d77eea
+ .xword 0x4aae673fdd3081de, 0xf9802b81de97deb1
+ .xword 0x4fc0b4dd24d2a599, 0xfceef8632775faf6
+ .xword 0xbb44828a8c9205c2, 0x086ace348f355aad
+ .xword 0x34107759db5dfbaa, 0x873e3be7d8faa4c5
+ .xword 0xc094410e731d5bf1, 0x73ba0db070ba049e
+ .xword 0xb86133d4dbcc19ff, 0x0b4f7f6ad86b4690
+ .xword 0x4ce50583738cb9a4, 0xffcb493d702be6cb
+ .xword 0xc3b1f050244347cc, 0x709fbcee27e418a3
+ .xword 0x3735c6078c03e797, 0x841b8ab98fa4b8f8
+ .xword 0xadda7c5f3c4488e3, 0x1ef430e13fe3d78c
+ .xword 0x595e4a08940428b8, 0xea7006b697a377d7
+ .xword 0xd60abfdbc3cbd6d0, 0x6524f365c06c89bf
+ .xword 0x228e898c6b8b768b, 0x91a0c532682c29e4
+ .xword 0x5a7bfb56c35a3485, 0xe955b7e8c0fd6bea
+ .xword 0xaeffcd016b1a94de, 0x1dd181bf68bdcbb1
+ .xword 0x21ab38d23cd56ab6, 0x9285746c3f7235d9
+ .xword 0xd52f0e859495caed, 0x6601423b97329582
+ .xword 0xd041dd676d77eeaa, 0x636f91d96ed0b1c5
+ .xword 0x24c5eb30c5374ef1, 0x97eba78ec690119e
+ .xword 0xab911ee392f8b099, 0x18bf525d915feff6
+ .xword 0x5f1528b43ab810c2, 0xec3b640a391f4fad
+ .xword 0x27e05a6e926952cc, 0x94ce16d091ce0da3
+ .xword 0xd3646c393a29f297, 0x604a2087398eadf8
+ .xword 0x5c3099ea6de60cff, 0xef1ed5546e415390
+ .xword 0xa8b4afbdc5a6aca4, 0x1b9ae303c601f3cb
+ .xword 0x56ed3e2f9e224471, 0xe5c372919d851b1e
+ .xword 0xa26908783662e42a, 0x114744c635c5bb45
+ .xword 0x2d3dfdab61ad1a42, 0x9e13b115620a452d
+ .xword 0xd9b9cbfcc9edba19, 0x6a978742ca4ae576
+ .xword 0xa14cb926613cf817, 0x1262f598629ba778
+ .xword 0x55c88f71c97c584c, 0xe6e6c3cfcadb0723
+ .xword 0xda9c7aa29eb3a624, 0x69b2361c9d14f94b
+ .xword 0x2e184cf536f3067f, 0x9d36004b35545910
+ .xword 0x2b769f17cf112238, 0x9858d3a9ccb67d57
+ .xword 0xdff2a94067518263, 0x6cdce5fe64f6dd0c
+ .xword 0x50a65c93309e7c0b, 0xe388102d33392364
+ .xword 0xa4226ac498dedc50, 0x170c267a9b79833f
+ .xword 0xdcd7181e300f9e5e, 0x6ff954a033a8c131
+ .xword 0x28532e49984f3e05, 0x9b7d62f79be8616a
+ .xword 0xa707db9acf80c06d, 0x14299724cc279f02
+ .xword 0x5383edcd67c06036, 0xe0ada17364673f59
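For the reflected form the table is built LSB-first from the bit-reversed polynomial (0xC96C5795D7870F42 for ECMA-182, which appears above as tab[0x80]); a matching generator, for reference:

	#include <stdint.h>

	/* Regenerate the LSB-first (reflected) CRC64 table, e.g. with
	 * 0xC96C5795D7870F42 for crc64_ecma_refl. */
	static void crc64_refl_gen_table(uint64_t rpoly, uint64_t tab[256])
	{
		for (int i = 0; i < 256; i++) {
			uint64_t crc = (uint64_t)i;
			for (int bit = 0; bit < 8; bit++)
				crc = (crc & 1) ? (crc >> 1) ^ rpoly : (crc >> 1);
			tab[i] = crc;
		}
	}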
diff --git a/src/isa-l/crc/aarch64/crc64_iso_norm_pmull.S b/src/isa-l/crc/aarch64/crc64_iso_norm_pmull.S
new file mode 100644
index 000000000..185b75bdf
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_iso_norm_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc64_iso_norm_pmull.h"
+#include "crc64_norm_common_pmull.h"
+
+crc64_norm_func crc64_iso_norm_pmull
diff --git a/src/isa-l/crc/aarch64/crc64_iso_norm_pmull.h b/src/isa-l/crc/aarch64/crc64_iso_norm_pmull.h
new file mode 100644
index 000000000..cc176051c
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_iso_norm_pmull.h
@@ -0,0 +1,201 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, (0x0101)
+.equ p4_low_b1, 0x0100
+.equ p4_low_b2, 0x0001
+.equ p4_low_b3, 0x0000
+.equ p4_high_b0, 0x1b1b
+.equ p4_high_b1, 0x1b00
+.equ p4_high_b2, 0x001b
+.equ p4_high_b3, 0x0000
+
+.equ p1_low_b0, (0x0145)
+.equ p1_low_b1, 0x0000
+.equ p1_low_b2, 0x0000
+.equ p1_low_b3, 0x0000
+.equ p1_high_b0, 0x1db7
+.equ p1_high_b1, 0x0000
+.equ p1_high_b2, 0x0000
+.equ p1_high_b3, 0x0000
+
+.equ p0_low_b0, (0x0145)
+.equ p0_low_b1, 0x0000
+.equ p0_low_b2, 0x0000
+.equ p0_low_b3, 0x0000
+.equ p0_high_b0, 0x0000
+.equ p0_high_b1, 0x0000
+.equ p0_high_b2, 0x0000
+.equ p0_high_b3, 0x0000
+
+.equ br_low_b0, (0x001b)
+.equ br_low_b1, 0x0000
+.equ br_low_b2, 0x0000
+.equ br_low_b3, 0x0000
+.equ br_high_b0, 0x001b
+.equ br_high_b1, 0x0000
+.equ br_high_b2, 0x0000
+.equ br_high_b3, 0x0000
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc64_tab, %object
+ .size crc64_tab, 2048
+
+crc64_tab:
+ .xword 0x0000000000000000, 0x000000000000001b
+ .xword 0x0000000000000036, 0x000000000000002d
+ .xword 0x000000000000006c, 0x0000000000000077
+ .xword 0x000000000000005a, 0x0000000000000041
+ .xword 0x00000000000000d8, 0x00000000000000c3
+ .xword 0x00000000000000ee, 0x00000000000000f5
+ .xword 0x00000000000000b4, 0x00000000000000af
+ .xword 0x0000000000000082, 0x0000000000000099
+ .xword 0x00000000000001b0, 0x00000000000001ab
+ .xword 0x0000000000000186, 0x000000000000019d
+ .xword 0x00000000000001dc, 0x00000000000001c7
+ .xword 0x00000000000001ea, 0x00000000000001f1
+ .xword 0x0000000000000168, 0x0000000000000173
+ .xword 0x000000000000015e, 0x0000000000000145
+ .xword 0x0000000000000104, 0x000000000000011f
+ .xword 0x0000000000000132, 0x0000000000000129
+ .xword 0x0000000000000360, 0x000000000000037b
+ .xword 0x0000000000000356, 0x000000000000034d
+ .xword 0x000000000000030c, 0x0000000000000317
+ .xword 0x000000000000033a, 0x0000000000000321
+ .xword 0x00000000000003b8, 0x00000000000003a3
+ .xword 0x000000000000038e, 0x0000000000000395
+ .xword 0x00000000000003d4, 0x00000000000003cf
+ .xword 0x00000000000003e2, 0x00000000000003f9
+ .xword 0x00000000000002d0, 0x00000000000002cb
+ .xword 0x00000000000002e6, 0x00000000000002fd
+ .xword 0x00000000000002bc, 0x00000000000002a7
+ .xword 0x000000000000028a, 0x0000000000000291
+ .xword 0x0000000000000208, 0x0000000000000213
+ .xword 0x000000000000023e, 0x0000000000000225
+ .xword 0x0000000000000264, 0x000000000000027f
+ .xword 0x0000000000000252, 0x0000000000000249
+ .xword 0x00000000000006c0, 0x00000000000006db
+ .xword 0x00000000000006f6, 0x00000000000006ed
+ .xword 0x00000000000006ac, 0x00000000000006b7
+ .xword 0x000000000000069a, 0x0000000000000681
+ .xword 0x0000000000000618, 0x0000000000000603
+ .xword 0x000000000000062e, 0x0000000000000635
+ .xword 0x0000000000000674, 0x000000000000066f
+ .xword 0x0000000000000642, 0x0000000000000659
+ .xword 0x0000000000000770, 0x000000000000076b
+ .xword 0x0000000000000746, 0x000000000000075d
+ .xword 0x000000000000071c, 0x0000000000000707
+ .xword 0x000000000000072a, 0x0000000000000731
+ .xword 0x00000000000007a8, 0x00000000000007b3
+ .xword 0x000000000000079e, 0x0000000000000785
+ .xword 0x00000000000007c4, 0x00000000000007df
+ .xword 0x00000000000007f2, 0x00000000000007e9
+ .xword 0x00000000000005a0, 0x00000000000005bb
+ .xword 0x0000000000000596, 0x000000000000058d
+ .xword 0x00000000000005cc, 0x00000000000005d7
+ .xword 0x00000000000005fa, 0x00000000000005e1
+ .xword 0x0000000000000578, 0x0000000000000563
+ .xword 0x000000000000054e, 0x0000000000000555
+ .xword 0x0000000000000514, 0x000000000000050f
+ .xword 0x0000000000000522, 0x0000000000000539
+ .xword 0x0000000000000410, 0x000000000000040b
+ .xword 0x0000000000000426, 0x000000000000043d
+ .xword 0x000000000000047c, 0x0000000000000467
+ .xword 0x000000000000044a, 0x0000000000000451
+ .xword 0x00000000000004c8, 0x00000000000004d3
+ .xword 0x00000000000004fe, 0x00000000000004e5
+ .xword 0x00000000000004a4, 0x00000000000004bf
+ .xword 0x0000000000000492, 0x0000000000000489
+ .xword 0x0000000000000d80, 0x0000000000000d9b
+ .xword 0x0000000000000db6, 0x0000000000000dad
+ .xword 0x0000000000000dec, 0x0000000000000df7
+ .xword 0x0000000000000dda, 0x0000000000000dc1
+ .xword 0x0000000000000d58, 0x0000000000000d43
+ .xword 0x0000000000000d6e, 0x0000000000000d75
+ .xword 0x0000000000000d34, 0x0000000000000d2f
+ .xword 0x0000000000000d02, 0x0000000000000d19
+ .xword 0x0000000000000c30, 0x0000000000000c2b
+ .xword 0x0000000000000c06, 0x0000000000000c1d
+ .xword 0x0000000000000c5c, 0x0000000000000c47
+ .xword 0x0000000000000c6a, 0x0000000000000c71
+ .xword 0x0000000000000ce8, 0x0000000000000cf3
+ .xword 0x0000000000000cde, 0x0000000000000cc5
+ .xword 0x0000000000000c84, 0x0000000000000c9f
+ .xword 0x0000000000000cb2, 0x0000000000000ca9
+ .xword 0x0000000000000ee0, 0x0000000000000efb
+ .xword 0x0000000000000ed6, 0x0000000000000ecd
+ .xword 0x0000000000000e8c, 0x0000000000000e97
+ .xword 0x0000000000000eba, 0x0000000000000ea1
+ .xword 0x0000000000000e38, 0x0000000000000e23
+ .xword 0x0000000000000e0e, 0x0000000000000e15
+ .xword 0x0000000000000e54, 0x0000000000000e4f
+ .xword 0x0000000000000e62, 0x0000000000000e79
+ .xword 0x0000000000000f50, 0x0000000000000f4b
+ .xword 0x0000000000000f66, 0x0000000000000f7d
+ .xword 0x0000000000000f3c, 0x0000000000000f27
+ .xword 0x0000000000000f0a, 0x0000000000000f11
+ .xword 0x0000000000000f88, 0x0000000000000f93
+ .xword 0x0000000000000fbe, 0x0000000000000fa5
+ .xword 0x0000000000000fe4, 0x0000000000000fff
+ .xword 0x0000000000000fd2, 0x0000000000000fc9
+ .xword 0x0000000000000b40, 0x0000000000000b5b
+ .xword 0x0000000000000b76, 0x0000000000000b6d
+ .xword 0x0000000000000b2c, 0x0000000000000b37
+ .xword 0x0000000000000b1a, 0x0000000000000b01
+ .xword 0x0000000000000b98, 0x0000000000000b83
+ .xword 0x0000000000000bae, 0x0000000000000bb5
+ .xword 0x0000000000000bf4, 0x0000000000000bef
+ .xword 0x0000000000000bc2, 0x0000000000000bd9
+ .xword 0x0000000000000af0, 0x0000000000000aeb
+ .xword 0x0000000000000ac6, 0x0000000000000add
+ .xword 0x0000000000000a9c, 0x0000000000000a87
+ .xword 0x0000000000000aaa, 0x0000000000000ab1
+ .xword 0x0000000000000a28, 0x0000000000000a33
+ .xword 0x0000000000000a1e, 0x0000000000000a05
+ .xword 0x0000000000000a44, 0x0000000000000a5f
+ .xword 0x0000000000000a72, 0x0000000000000a69
+ .xword 0x0000000000000820, 0x000000000000083b
+ .xword 0x0000000000000816, 0x000000000000080d
+ .xword 0x000000000000084c, 0x0000000000000857
+ .xword 0x000000000000087a, 0x0000000000000861
+ .xword 0x00000000000008f8, 0x00000000000008e3
+ .xword 0x00000000000008ce, 0x00000000000008d5
+ .xword 0x0000000000000894, 0x000000000000088f
+ .xword 0x00000000000008a2, 0x00000000000008b9
+ .xword 0x0000000000000990, 0x000000000000098b
+ .xword 0x00000000000009a6, 0x00000000000009bd
+ .xword 0x00000000000009fc, 0x00000000000009e7
+ .xword 0x00000000000009ca, 0x00000000000009d1
+ .xword 0x0000000000000948, 0x0000000000000953
+ .xword 0x000000000000097e, 0x0000000000000965
+ .xword 0x0000000000000924, 0x000000000000093f
+ .xword 0x0000000000000912, 0x0000000000000909
diff --git a/src/isa-l/crc/aarch64/crc64_iso_refl_pmull.S b/src/isa-l/crc/aarch64/crc64_iso_refl_pmull.S
new file mode 100644
index 000000000..2d2bc6658
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_iso_refl_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc64_iso_refl_pmull.h"
+#include "crc64_refl_common_pmull.h"
+
+crc64_refl_func crc64_iso_refl_pmull
diff --git a/src/isa-l/crc/aarch64/crc64_iso_refl_pmull.h b/src/isa-l/crc/aarch64/crc64_iso_refl_pmull.h
new file mode 100644
index 000000000..8ee4f58b1
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_iso_refl_pmull.h
@@ -0,0 +1,197 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, 0x0001
+.equ p4_low_b1, 0xb000
+.equ p4_low_b2, 0x01b1
+.equ p4_low_b3, 0x01b0
+.equ p4_high_b0, 0x0001
+.equ p4_high_b1, 0x0000
+.equ p4_high_b2, 0x0101
+.equ p4_high_b3, 0xb100
+
+.equ p1_low_b0, 0x0001
+.equ p1_low_b1, 0x0000
+.equ p1_low_b2, 0x0000
+.equ p1_low_b3, 0x6b70
+.equ p1_high_b0, 0x0001
+.equ p1_high_b1, 0x0000
+.equ p1_high_b2, 0x0000
+.equ p1_high_b3, 0xf500
+
+.equ p0_low_b0, 0x0001
+.equ p0_low_b1, 0x0000
+.equ p0_low_b2, 0x0000
+.equ p0_low_b3, 0xf500
+
+.equ br_low_b0, 0x0001
+.equ br_low_b1, 0x0000
+.equ br_low_b2, 0x0000
+.equ br_low_b3, 0xb000
+.equ br_high_b0, 0x0001
+.equ br_high_b1, 0x0000
+.equ br_high_b2, 0x0000
+.equ br_high_b3, 0xb000
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc64_tab, %object
+ .size crc64_tab, 2048
+
+crc64_tab:
+ .xword 0x0000000000000000, 0x01b0000000000000
+ .xword 0x0360000000000000, 0x02d0000000000000
+ .xword 0x06c0000000000000, 0x0770000000000000
+ .xword 0x05a0000000000000, 0x0410000000000000
+ .xword 0x0d80000000000000, 0x0c30000000000000
+ .xword 0x0ee0000000000000, 0x0f50000000000000
+ .xword 0x0b40000000000000, 0x0af0000000000000
+ .xword 0x0820000000000000, 0x0990000000000000
+ .xword 0x1b00000000000000, 0x1ab0000000000000
+ .xword 0x1860000000000000, 0x19d0000000000000
+ .xword 0x1dc0000000000000, 0x1c70000000000000
+ .xword 0x1ea0000000000000, 0x1f10000000000000
+ .xword 0x1680000000000000, 0x1730000000000000
+ .xword 0x15e0000000000000, 0x1450000000000000
+ .xword 0x1040000000000000, 0x11f0000000000000
+ .xword 0x1320000000000000, 0x1290000000000000
+ .xword 0x3600000000000000, 0x37b0000000000000
+ .xword 0x3560000000000000, 0x34d0000000000000
+ .xword 0x30c0000000000000, 0x3170000000000000
+ .xword 0x33a0000000000000, 0x3210000000000000
+ .xword 0x3b80000000000000, 0x3a30000000000000
+ .xword 0x38e0000000000000, 0x3950000000000000
+ .xword 0x3d40000000000000, 0x3cf0000000000000
+ .xword 0x3e20000000000000, 0x3f90000000000000
+ .xword 0x2d00000000000000, 0x2cb0000000000000
+ .xword 0x2e60000000000000, 0x2fd0000000000000
+ .xword 0x2bc0000000000000, 0x2a70000000000000
+ .xword 0x28a0000000000000, 0x2910000000000000
+ .xword 0x2080000000000000, 0x2130000000000000
+ .xword 0x23e0000000000000, 0x2250000000000000
+ .xword 0x2640000000000000, 0x27f0000000000000
+ .xword 0x2520000000000000, 0x2490000000000000
+ .xword 0x6c00000000000000, 0x6db0000000000000
+ .xword 0x6f60000000000000, 0x6ed0000000000000
+ .xword 0x6ac0000000000000, 0x6b70000000000000
+ .xword 0x69a0000000000000, 0x6810000000000000
+ .xword 0x6180000000000000, 0x6030000000000000
+ .xword 0x62e0000000000000, 0x6350000000000000
+ .xword 0x6740000000000000, 0x66f0000000000000
+ .xword 0x6420000000000000, 0x6590000000000000
+ .xword 0x7700000000000000, 0x76b0000000000000
+ .xword 0x7460000000000000, 0x75d0000000000000
+ .xword 0x71c0000000000000, 0x7070000000000000
+ .xword 0x72a0000000000000, 0x7310000000000000
+ .xword 0x7a80000000000000, 0x7b30000000000000
+ .xword 0x79e0000000000000, 0x7850000000000000
+ .xword 0x7c40000000000000, 0x7df0000000000000
+ .xword 0x7f20000000000000, 0x7e90000000000000
+ .xword 0x5a00000000000000, 0x5bb0000000000000
+ .xword 0x5960000000000000, 0x58d0000000000000
+ .xword 0x5cc0000000000000, 0x5d70000000000000
+ .xword 0x5fa0000000000000, 0x5e10000000000000
+ .xword 0x5780000000000000, 0x5630000000000000
+ .xword 0x54e0000000000000, 0x5550000000000000
+ .xword 0x5140000000000000, 0x50f0000000000000
+ .xword 0x5220000000000000, 0x5390000000000000
+ .xword 0x4100000000000000, 0x40b0000000000000
+ .xword 0x4260000000000000, 0x43d0000000000000
+ .xword 0x47c0000000000000, 0x4670000000000000
+ .xword 0x44a0000000000000, 0x4510000000000000
+ .xword 0x4c80000000000000, 0x4d30000000000000
+ .xword 0x4fe0000000000000, 0x4e50000000000000
+ .xword 0x4a40000000000000, 0x4bf0000000000000
+ .xword 0x4920000000000000, 0x4890000000000000
+ .xword 0xd800000000000000, 0xd9b0000000000000
+ .xword 0xdb60000000000000, 0xdad0000000000000
+ .xword 0xdec0000000000000, 0xdf70000000000000
+ .xword 0xdda0000000000000, 0xdc10000000000000
+ .xword 0xd580000000000000, 0xd430000000000000
+ .xword 0xd6e0000000000000, 0xd750000000000000
+ .xword 0xd340000000000000, 0xd2f0000000000000
+ .xword 0xd020000000000000, 0xd190000000000000
+ .xword 0xc300000000000000, 0xc2b0000000000000
+ .xword 0xc060000000000000, 0xc1d0000000000000
+ .xword 0xc5c0000000000000, 0xc470000000000000
+ .xword 0xc6a0000000000000, 0xc710000000000000
+ .xword 0xce80000000000000, 0xcf30000000000000
+ .xword 0xcde0000000000000, 0xcc50000000000000
+ .xword 0xc840000000000000, 0xc9f0000000000000
+ .xword 0xcb20000000000000, 0xca90000000000000
+ .xword 0xee00000000000000, 0xefb0000000000000
+ .xword 0xed60000000000000, 0xecd0000000000000
+ .xword 0xe8c0000000000000, 0xe970000000000000
+ .xword 0xeba0000000000000, 0xea10000000000000
+ .xword 0xe380000000000000, 0xe230000000000000
+ .xword 0xe0e0000000000000, 0xe150000000000000
+ .xword 0xe540000000000000, 0xe4f0000000000000
+ .xword 0xe620000000000000, 0xe790000000000000
+ .xword 0xf500000000000000, 0xf4b0000000000000
+ .xword 0xf660000000000000, 0xf7d0000000000000
+ .xword 0xf3c0000000000000, 0xf270000000000000
+ .xword 0xf0a0000000000000, 0xf110000000000000
+ .xword 0xf880000000000000, 0xf930000000000000
+ .xword 0xfbe0000000000000, 0xfa50000000000000
+ .xword 0xfe40000000000000, 0xfff0000000000000
+ .xword 0xfd20000000000000, 0xfc90000000000000
+ .xword 0xb400000000000000, 0xb5b0000000000000
+ .xword 0xb760000000000000, 0xb6d0000000000000
+ .xword 0xb2c0000000000000, 0xb370000000000000
+ .xword 0xb1a0000000000000, 0xb010000000000000
+ .xword 0xb980000000000000, 0xb830000000000000
+ .xword 0xbae0000000000000, 0xbb50000000000000
+ .xword 0xbf40000000000000, 0xbef0000000000000
+ .xword 0xbc20000000000000, 0xbd90000000000000
+ .xword 0xaf00000000000000, 0xaeb0000000000000
+ .xword 0xac60000000000000, 0xadd0000000000000
+ .xword 0xa9c0000000000000, 0xa870000000000000
+ .xword 0xaaa0000000000000, 0xab10000000000000
+ .xword 0xa280000000000000, 0xa330000000000000
+ .xword 0xa1e0000000000000, 0xa050000000000000
+ .xword 0xa440000000000000, 0xa5f0000000000000
+ .xword 0xa720000000000000, 0xa690000000000000
+ .xword 0x8200000000000000, 0x83b0000000000000
+ .xword 0x8160000000000000, 0x80d0000000000000
+ .xword 0x84c0000000000000, 0x8570000000000000
+ .xword 0x87a0000000000000, 0x8610000000000000
+ .xword 0x8f80000000000000, 0x8e30000000000000
+ .xword 0x8ce0000000000000, 0x8d50000000000000
+ .xword 0x8940000000000000, 0x88f0000000000000
+ .xword 0x8a20000000000000, 0x8b90000000000000
+ .xword 0x9900000000000000, 0x98b0000000000000
+ .xword 0x9a60000000000000, 0x9bd0000000000000
+ .xword 0x9fc0000000000000, 0x9e70000000000000
+ .xword 0x9ca0000000000000, 0x9d10000000000000
+ .xword 0x9480000000000000, 0x9530000000000000
+ .xword 0x97e0000000000000, 0x9650000000000000
+ .xword 0x9240000000000000, 0x93f0000000000000
+ .xword 0x9120000000000000, 0x9090000000000000
diff --git a/src/isa-l/crc/aarch64/crc64_jones_norm_pmull.S b/src/isa-l/crc/aarch64/crc64_jones_norm_pmull.S
new file mode 100644
index 000000000..4f298376c
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_jones_norm_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc64_jones_norm_pmull.h"
+#include "crc64_norm_common_pmull.h"
+
+crc64_norm_func crc64_jones_norm_pmull
diff --git a/src/isa-l/crc/aarch64/crc64_jones_norm_pmull.h b/src/isa-l/crc/aarch64/crc64_jones_norm_pmull.h
new file mode 100644
index 000000000..a20fa045d
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_jones_norm_pmull.h
@@ -0,0 +1,200 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, (0xd25e)
+.equ p4_low_b1, 0xca43
+.equ p4_low_b2, 0x1e58
+.equ p4_low_b3, 0x4e50
+.equ p4_high_b0, 0xf643
+.equ p4_high_b1, 0x8f27
+.equ p4_high_b2, 0x6158
+.equ p4_high_b3, 0x13c9
+
+.equ p1_low_b0, (0x7038)
+.equ p1_low_b1, 0x5001
+.equ p1_low_b2, 0xed27
+.equ p1_low_b3, 0x4445
+.equ p1_high_b0, 0xd736
+.equ p1_high_b1, 0x7cfb
+.equ p1_high_b2, 0x7415
+.equ p1_high_b3, 0x698b
+
+.equ p0_low_b0, (0x7038)
+.equ p0_low_b1, 0x5001
+.equ p0_low_b2, 0xed27
+.equ p0_low_b3, 0x4445
+.equ p0_high_b0, 0x0000
+.equ p0_high_b1, 0x0000
+.equ p0_high_b2, 0x0000
+.equ p0_high_b3, 0x0000
+
+.equ br_low_b0, (0x6cf8)
+.equ br_low_b1, 0x98be
+.equ br_low_b2, 0xeeb2
+.equ br_low_b3, 0xddf3
+.equ br_high_b0, 0x35a9
+.equ br_high_b1, 0x94c9
+.equ br_high_b2, 0xd235
+.equ br_high_b3, 0xad93
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc64_tab, %object
+ .size crc64_tab, 2048
+crc64_tab:
+ .xword 0x0000000000000000, 0xad93d23594c935a9
+ .xword 0xf6b4765ebd5b5efb, 0x5b27a46b29926b52
+ .xword 0x40fb3e88ee7f885f, 0xed68ecbd7ab6bdf6
+ .xword 0xb64f48d65324d6a4, 0x1bdc9ae3c7ede30d
+ .xword 0x81f67d11dcff10be, 0x2c65af2448362517
+ .xword 0x77420b4f61a44e45, 0xdad1d97af56d7bec
+ .xword 0xc10d4399328098e1, 0x6c9e91aca649ad48
+ .xword 0x37b935c78fdbc61a, 0x9a2ae7f21b12f3b3
+ .xword 0xae7f28162d3714d5, 0x03ecfa23b9fe217c
+ .xword 0x58cb5e48906c4a2e, 0xf5588c7d04a57f87
+ .xword 0xee84169ec3489c8a, 0x4317c4ab5781a923
+ .xword 0x183060c07e13c271, 0xb5a3b2f5eadaf7d8
+ .xword 0x2f895507f1c8046b, 0x821a8732650131c2
+ .xword 0xd93d23594c935a90, 0x74aef16cd85a6f39
+ .xword 0x6f726b8f1fb78c34, 0xc2e1b9ba8b7eb99d
+ .xword 0x99c61dd1a2ecd2cf, 0x3455cfe43625e766
+ .xword 0xf16d8219cea71c03, 0x5cfe502c5a6e29aa
+ .xword 0x07d9f44773fc42f8, 0xaa4a2672e7357751
+ .xword 0xb196bc9120d8945c, 0x1c056ea4b411a1f5
+ .xword 0x4722cacf9d83caa7, 0xeab118fa094aff0e
+ .xword 0x709bff0812580cbd, 0xdd082d3d86913914
+ .xword 0x862f8956af035246, 0x2bbc5b633bca67ef
+ .xword 0x3060c180fc2784e2, 0x9df313b568eeb14b
+ .xword 0xc6d4b7de417cda19, 0x6b4765ebd5b5efb0
+ .xword 0x5f12aa0fe39008d6, 0xf281783a77593d7f
+ .xword 0xa9a6dc515ecb562d, 0x04350e64ca026384
+ .xword 0x1fe994870def8089, 0xb27a46b29926b520
+ .xword 0xe95de2d9b0b4de72, 0x44ce30ec247debdb
+ .xword 0xdee4d71e3f6f1868, 0x7377052baba62dc1
+ .xword 0x2850a14082344693, 0x85c3737516fd733a
+ .xword 0x9e1fe996d1109037, 0x338c3ba345d9a59e
+ .xword 0x68ab9fc86c4bcecc, 0xc5384dfdf882fb65
+ .xword 0x4f48d60609870daf, 0xe2db04339d4e3806
+ .xword 0xb9fca058b4dc5354, 0x146f726d201566fd
+ .xword 0x0fb3e88ee7f885f0, 0xa2203abb7331b059
+ .xword 0xf9079ed05aa3db0b, 0x54944ce5ce6aeea2
+ .xword 0xcebeab17d5781d11, 0x632d792241b128b8
+ .xword 0x380add49682343ea, 0x95990f7cfcea7643
+ .xword 0x8e45959f3b07954e, 0x23d647aaafcea0e7
+ .xword 0x78f1e3c1865ccbb5, 0xd56231f41295fe1c
+ .xword 0xe137fe1024b0197a, 0x4ca42c25b0792cd3
+ .xword 0x1783884e99eb4781, 0xba105a7b0d227228
+ .xword 0xa1ccc098cacf9125, 0x0c5f12ad5e06a48c
+ .xword 0x5778b6c67794cfde, 0xfaeb64f3e35dfa77
+ .xword 0x60c18301f84f09c4, 0xcd5251346c863c6d
+ .xword 0x9675f55f4514573f, 0x3be6276ad1dd6296
+ .xword 0x203abd891630819b, 0x8da96fbc82f9b432
+ .xword 0xd68ecbd7ab6bdf60, 0x7b1d19e23fa2eac9
+ .xword 0xbe25541fc72011ac, 0x13b6862a53e92405
+ .xword 0x489122417a7b4f57, 0xe502f074eeb27afe
+ .xword 0xfede6a97295f99f3, 0x534db8a2bd96ac5a
+ .xword 0x086a1cc99404c708, 0xa5f9cefc00cdf2a1
+ .xword 0x3fd3290e1bdf0112, 0x9240fb3b8f1634bb
+ .xword 0xc9675f50a6845fe9, 0x64f48d65324d6a40
+ .xword 0x7f281786f5a0894d, 0xd2bbc5b36169bce4
+ .xword 0x899c61d848fbd7b6, 0x240fb3eddc32e21f
+ .xword 0x105a7c09ea170579, 0xbdc9ae3c7ede30d0
+ .xword 0xe6ee0a57574c5b82, 0x4b7dd862c3856e2b
+ .xword 0x50a1428104688d26, 0xfd3290b490a1b88f
+ .xword 0xa61534dfb933d3dd, 0x0b86e6ea2dfae674
+ .xword 0x91ac011836e815c7, 0x3c3fd32da221206e
+ .xword 0x671877468bb34b3c, 0xca8ba5731f7a7e95
+ .xword 0xd1573f90d8979d98, 0x7cc4eda54c5ea831
+ .xword 0x27e349ce65ccc363, 0x8a709bfbf105f6ca
+ .xword 0x9e91ac0c130e1b5e, 0x33027e3987c72ef7
+ .xword 0x6825da52ae5545a5, 0xc5b608673a9c700c
+ .xword 0xde6a9284fd719301, 0x73f940b169b8a6a8
+ .xword 0x28dee4da402acdfa, 0x854d36efd4e3f853
+ .xword 0x1f67d11dcff10be0, 0xb2f403285b383e49
+ .xword 0xe9d3a74372aa551b, 0x44407576e66360b2
+ .xword 0x5f9cef95218e83bf, 0xf20f3da0b547b616
+ .xword 0xa92899cb9cd5dd44, 0x04bb4bfe081ce8ed
+ .xword 0x30ee841a3e390f8b, 0x9d7d562faaf03a22
+ .xword 0xc65af24483625170, 0x6bc9207117ab64d9
+ .xword 0x7015ba92d04687d4, 0xdd8668a7448fb27d
+ .xword 0x86a1cccc6d1dd92f, 0x2b321ef9f9d4ec86
+ .xword 0xb118f90be2c61f35, 0x1c8b2b3e760f2a9c
+ .xword 0x47ac8f555f9d41ce, 0xea3f5d60cb547467
+ .xword 0xf1e3c7830cb9976a, 0x5c7015b69870a2c3
+ .xword 0x0757b1ddb1e2c991, 0xaac463e8252bfc38
+ .xword 0x6ffc2e15dda9075d, 0xc26ffc20496032f4
+ .xword 0x9948584b60f259a6, 0x34db8a7ef43b6c0f
+ .xword 0x2f07109d33d68f02, 0x8294c2a8a71fbaab
+ .xword 0xd9b366c38e8dd1f9, 0x7420b4f61a44e450
+ .xword 0xee0a5304015617e3, 0x43998131959f224a
+ .xword 0x18be255abc0d4918, 0xb52df76f28c47cb1
+ .xword 0xaef16d8cef299fbc, 0x0362bfb97be0aa15
+ .xword 0x58451bd25272c147, 0xf5d6c9e7c6bbf4ee
+ .xword 0xc1830603f09e1388, 0x6c10d43664572621
+ .xword 0x3737705d4dc54d73, 0x9aa4a268d90c78da
+ .xword 0x8178388b1ee19bd7, 0x2cebeabe8a28ae7e
+ .xword 0x77cc4ed5a3bac52c, 0xda5f9ce03773f085
+ .xword 0x40757b122c610336, 0xede6a927b8a8369f
+ .xword 0xb6c10d4c913a5dcd, 0x1b52df7905f36864
+ .xword 0x008e459ac21e8b69, 0xad1d97af56d7bec0
+ .xword 0xf63a33c47f45d592, 0x5ba9e1f1eb8ce03b
+ .xword 0xd1d97a0a1a8916f1, 0x7c4aa83f8e402358
+ .xword 0x276d0c54a7d2480a, 0x8afede61331b7da3
+ .xword 0x91224482f4f69eae, 0x3cb196b7603fab07
+ .xword 0x679632dc49adc055, 0xca05e0e9dd64f5fc
+ .xword 0x502f071bc676064f, 0xfdbcd52e52bf33e6
+ .xword 0xa69b71457b2d58b4, 0x0b08a370efe46d1d
+ .xword 0x10d4399328098e10, 0xbd47eba6bcc0bbb9
+ .xword 0xe6604fcd9552d0eb, 0x4bf39df8019be542
+ .xword 0x7fa6521c37be0224, 0xd2358029a377378d
+ .xword 0x891224428ae55cdf, 0x2481f6771e2c6976
+ .xword 0x3f5d6c94d9c18a7b, 0x92cebea14d08bfd2
+ .xword 0xc9e91aca649ad480, 0x647ac8fff053e129
+ .xword 0xfe502f0deb41129a, 0x53c3fd387f882733
+ .xword 0x08e45953561a4c61, 0xa5778b66c2d379c8
+ .xword 0xbeab1185053e9ac5, 0x1338c3b091f7af6c
+ .xword 0x481f67dbb865c43e, 0xe58cb5ee2cacf197
+ .xword 0x20b4f813d42e0af2, 0x8d272a2640e73f5b
+ .xword 0xd6008e4d69755409, 0x7b935c78fdbc61a0
+ .xword 0x604fc69b3a5182ad, 0xcddc14aeae98b704
+ .xword 0x96fbb0c5870adc56, 0x3b6862f013c3e9ff
+ .xword 0xa142850208d11a4c, 0x0cd157379c182fe5
+ .xword 0x57f6f35cb58a44b7, 0xfa6521692143711e
+ .xword 0xe1b9bb8ae6ae9213, 0x4c2a69bf7267a7ba
+ .xword 0x170dcdd45bf5cce8, 0xba9e1fe1cf3cf941
+ .xword 0x8ecbd005f9191e27, 0x235802306dd02b8e
+ .xword 0x787fa65b444240dc, 0xd5ec746ed08b7575
+ .xword 0xce30ee8d17669678, 0x63a33cb883afa3d1
+ .xword 0x388498d3aa3dc883, 0x95174ae63ef4fd2a
+ .xword 0x0f3dad1425e60e99, 0xa2ae7f21b12f3b30
+ .xword 0xf989db4a98bd5062, 0x541a097f0c7465cb
+ .xword 0x4fc6939ccb9986c6, 0xe25541a95f50b36f
+ .xword 0xb972e5c276c2d83d, 0x14e137f7e20bed94
diff --git a/src/isa-l/crc/aarch64/crc64_jones_refl_pmull.S b/src/isa-l/crc/aarch64/crc64_jones_refl_pmull.S
new file mode 100644
index 000000000..177092f9f
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_jones_refl_pmull.S
@@ -0,0 +1,33 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc64_jones_refl_pmull.h"
+#include "crc64_refl_common_pmull.h"
+
+crc64_refl_func crc64_jones_refl_pmull
diff --git a/src/isa-l/crc/aarch64/crc64_jones_refl_pmull.h b/src/isa-l/crc/aarch64/crc64_jones_refl_pmull.h
new file mode 100644
index 000000000..5bf98f73e
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_jones_refl_pmull.h
@@ -0,0 +1,196 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.equ p4_low_b0, 0xb4fb
+.equ p4_low_b1, 0x6d9a
+.equ p4_low_b2, 0xefb1
+.equ p4_low_b3, 0xaf86
+.equ p4_high_b0, 0x14e4
+.equ p4_high_b1, 0x34f0
+.equ p4_high_b2, 0x84a6
+.equ p4_high_b3, 0xf497
+
+.equ p1_low_b0, 0xa32c
+.equ p1_low_b1, 0x505d
+.equ p1_low_b2, 0xbe7d
+.equ p1_low_b3, 0xd9d7
+.equ p1_high_b0, 0x4444
+.equ p1_high_b1, 0xc96f
+.equ p1_high_b2, 0x0015
+.equ p1_high_b3, 0x381d
+
+.equ p0_low_b0, 0x4444
+.equ p0_low_b1, 0xc96f
+.equ p0_low_b2, 0x0015
+.equ p0_low_b3, 0x381d
+
+.equ br_low_b0, 0x9f77
+.equ br_low_b1, 0x9aef
+.equ br_low_b2, 0xfa32
+.equ br_low_b3, 0x3e6c
+.equ br_high_b0, 0x936b
+.equ br_high_b1, 0x5897
+.equ br_high_b2, 0x2653
+.equ br_high_b3, 0x2b59
+
+ .text
+ .section .rodata
+ .align 4
+ .set .lanchor_crc_tab,. + 0
+ .type crc64_tab, %object
+ .size crc64_tab, 2048
+crc64_tab:
+ .xword 0x0000000000000000, 0x7ad870c830358979
+ .xword 0xf5b0e190606b12f2, 0x8f689158505e9b8b
+ .xword 0xc038e5739841b68f, 0xbae095bba8743ff6
+ .xword 0x358804e3f82aa47d, 0x4f50742bc81f2d04
+ .xword 0xab28ecb46814fe75, 0xd1f09c7c5821770c
+ .xword 0x5e980d24087fec87, 0x24407dec384a65fe
+ .xword 0x6b1009c7f05548fa, 0x11c8790fc060c183
+ .xword 0x9ea0e857903e5a08, 0xe478989fa00bd371
+ .xword 0x7d08ff3b88be6f81, 0x07d08ff3b88be6f8
+ .xword 0x88b81eabe8d57d73, 0xf2606e63d8e0f40a
+ .xword 0xbd301a4810ffd90e, 0xc7e86a8020ca5077
+ .xword 0x4880fbd87094cbfc, 0x32588b1040a14285
+ .xword 0xd620138fe0aa91f4, 0xacf86347d09f188d
+ .xword 0x2390f21f80c18306, 0x594882d7b0f40a7f
+ .xword 0x1618f6fc78eb277b, 0x6cc0863448deae02
+ .xword 0xe3a8176c18803589, 0x997067a428b5bcf0
+ .xword 0xfa11fe77117cdf02, 0x80c98ebf2149567b
+ .xword 0x0fa11fe77117cdf0, 0x75796f2f41224489
+ .xword 0x3a291b04893d698d, 0x40f16bccb908e0f4
+ .xword 0xcf99fa94e9567b7f, 0xb5418a5cd963f206
+ .xword 0x513912c379682177, 0x2be1620b495da80e
+ .xword 0xa489f35319033385, 0xde51839b2936bafc
+ .xword 0x9101f7b0e12997f8, 0xebd98778d11c1e81
+ .xword 0x64b116208142850a, 0x1e6966e8b1770c73
+ .xword 0x8719014c99c2b083, 0xfdc17184a9f739fa
+ .xword 0x72a9e0dcf9a9a271, 0x08719014c99c2b08
+ .xword 0x4721e43f0183060c, 0x3df994f731b68f75
+ .xword 0xb29105af61e814fe, 0xc849756751dd9d87
+ .xword 0x2c31edf8f1d64ef6, 0x56e99d30c1e3c78f
+ .xword 0xd9810c6891bd5c04, 0xa3597ca0a188d57d
+ .xword 0xec09088b6997f879, 0x96d1784359a27100
+ .xword 0x19b9e91b09fcea8b, 0x636199d339c963f2
+ .xword 0xdf7adabd7a6e2d6f, 0xa5a2aa754a5ba416
+ .xword 0x2aca3b2d1a053f9d, 0x50124be52a30b6e4
+ .xword 0x1f423fcee22f9be0, 0x659a4f06d21a1299
+ .xword 0xeaf2de5e82448912, 0x902aae96b271006b
+ .xword 0x74523609127ad31a, 0x0e8a46c1224f5a63
+ .xword 0x81e2d7997211c1e8, 0xfb3aa75142244891
+ .xword 0xb46ad37a8a3b6595, 0xceb2a3b2ba0eecec
+ .xword 0x41da32eaea507767, 0x3b024222da65fe1e
+ .xword 0xa2722586f2d042ee, 0xd8aa554ec2e5cb97
+ .xword 0x57c2c41692bb501c, 0x2d1ab4dea28ed965
+ .xword 0x624ac0f56a91f461, 0x1892b03d5aa47d18
+ .xword 0x97fa21650afae693, 0xed2251ad3acf6fea
+ .xword 0x095ac9329ac4bc9b, 0x7382b9faaaf135e2
+ .xword 0xfcea28a2faafae69, 0x8632586aca9a2710
+ .xword 0xc9622c4102850a14, 0xb3ba5c8932b0836d
+ .xword 0x3cd2cdd162ee18e6, 0x460abd1952db919f
+ .xword 0x256b24ca6b12f26d, 0x5fb354025b277b14
+ .xword 0xd0dbc55a0b79e09f, 0xaa03b5923b4c69e6
+ .xword 0xe553c1b9f35344e2, 0x9f8bb171c366cd9b
+ .xword 0x10e3202993385610, 0x6a3b50e1a30ddf69
+ .xword 0x8e43c87e03060c18, 0xf49bb8b633338561
+ .xword 0x7bf329ee636d1eea, 0x012b592653589793
+ .xword 0x4e7b2d0d9b47ba97, 0x34a35dc5ab7233ee
+ .xword 0xbbcbcc9dfb2ca865, 0xc113bc55cb19211c
+ .xword 0x5863dbf1e3ac9dec, 0x22bbab39d3991495
+ .xword 0xadd33a6183c78f1e, 0xd70b4aa9b3f20667
+ .xword 0x985b3e827bed2b63, 0xe2834e4a4bd8a21a
+ .xword 0x6debdf121b863991, 0x1733afda2bb3b0e8
+ .xword 0xf34b37458bb86399, 0x8993478dbb8deae0
+ .xword 0x06fbd6d5ebd3716b, 0x7c23a61ddbe6f812
+ .xword 0x3373d23613f9d516, 0x49aba2fe23cc5c6f
+ .xword 0xc6c333a67392c7e4, 0xbc1b436e43a74e9d
+ .xword 0x95ac9329ac4bc9b5, 0xef74e3e19c7e40cc
+ .xword 0x601c72b9cc20db47, 0x1ac40271fc15523e
+ .xword 0x5594765a340a7f3a, 0x2f4c0692043ff643
+ .xword 0xa02497ca54616dc8, 0xdafce7026454e4b1
+ .xword 0x3e847f9dc45f37c0, 0x445c0f55f46abeb9
+ .xword 0xcb349e0da4342532, 0xb1eceec59401ac4b
+ .xword 0xfebc9aee5c1e814f, 0x8464ea266c2b0836
+ .xword 0x0b0c7b7e3c7593bd, 0x71d40bb60c401ac4
+ .xword 0xe8a46c1224f5a634, 0x927c1cda14c02f4d
+ .xword 0x1d148d82449eb4c6, 0x67ccfd4a74ab3dbf
+ .xword 0x289c8961bcb410bb, 0x5244f9a98c8199c2
+ .xword 0xdd2c68f1dcdf0249, 0xa7f41839ecea8b30
+ .xword 0x438c80a64ce15841, 0x3954f06e7cd4d138
+ .xword 0xb63c61362c8a4ab3, 0xcce411fe1cbfc3ca
+ .xword 0x83b465d5d4a0eece, 0xf96c151de49567b7
+ .xword 0x76048445b4cbfc3c, 0x0cdcf48d84fe7545
+ .xword 0x6fbd6d5ebd3716b7, 0x15651d968d029fce
+ .xword 0x9a0d8ccedd5c0445, 0xe0d5fc06ed698d3c
+ .xword 0xaf85882d2576a038, 0xd55df8e515432941
+ .xword 0x5a3569bd451db2ca, 0x20ed197575283bb3
+ .xword 0xc49581ead523e8c2, 0xbe4df122e51661bb
+ .xword 0x3125607ab548fa30, 0x4bfd10b2857d7349
+ .xword 0x04ad64994d625e4d, 0x7e7514517d57d734
+ .xword 0xf11d85092d094cbf, 0x8bc5f5c11d3cc5c6
+ .xword 0x12b5926535897936, 0x686de2ad05bcf04f
+ .xword 0xe70573f555e26bc4, 0x9ddd033d65d7e2bd
+ .xword 0xd28d7716adc8cfb9, 0xa85507de9dfd46c0
+ .xword 0x273d9686cda3dd4b, 0x5de5e64efd965432
+ .xword 0xb99d7ed15d9d8743, 0xc3450e196da80e3a
+ .xword 0x4c2d9f413df695b1, 0x36f5ef890dc31cc8
+ .xword 0x79a59ba2c5dc31cc, 0x037deb6af5e9b8b5
+ .xword 0x8c157a32a5b7233e, 0xf6cd0afa9582aa47
+ .xword 0x4ad64994d625e4da, 0x300e395ce6106da3
+ .xword 0xbf66a804b64ef628, 0xc5bed8cc867b7f51
+ .xword 0x8aeeace74e645255, 0xf036dc2f7e51db2c
+ .xword 0x7f5e4d772e0f40a7, 0x05863dbf1e3ac9de
+ .xword 0xe1fea520be311aaf, 0x9b26d5e88e0493d6
+ .xword 0x144e44b0de5a085d, 0x6e963478ee6f8124
+ .xword 0x21c640532670ac20, 0x5b1e309b16452559
+ .xword 0xd476a1c3461bbed2, 0xaeaed10b762e37ab
+ .xword 0x37deb6af5e9b8b5b, 0x4d06c6676eae0222
+ .xword 0xc26e573f3ef099a9, 0xb8b627f70ec510d0
+ .xword 0xf7e653dcc6da3dd4, 0x8d3e2314f6efb4ad
+ .xword 0x0256b24ca6b12f26, 0x788ec2849684a65f
+ .xword 0x9cf65a1b368f752e, 0xe62e2ad306bafc57
+ .xword 0x6946bb8b56e467dc, 0x139ecb4366d1eea5
+ .xword 0x5ccebf68aecec3a1, 0x2616cfa09efb4ad8
+ .xword 0xa97e5ef8cea5d153, 0xd3a62e30fe90582a
+ .xword 0xb0c7b7e3c7593bd8, 0xca1fc72bf76cb2a1
+ .xword 0x45775673a732292a, 0x3faf26bb9707a053
+ .xword 0x70ff52905f188d57, 0x0a2722586f2d042e
+ .xword 0x854fb3003f739fa5, 0xff97c3c80f4616dc
+ .xword 0x1bef5b57af4dc5ad, 0x61372b9f9f784cd4
+ .xword 0xee5fbac7cf26d75f, 0x9487ca0fff135e26
+ .xword 0xdbd7be24370c7322, 0xa10fceec0739fa5b
+ .xword 0x2e675fb4576761d0, 0x54bf2f7c6752e8a9
+ .xword 0xcdcf48d84fe75459, 0xb71738107fd2dd20
+ .xword 0x387fa9482f8c46ab, 0x42a7d9801fb9cfd2
+ .xword 0x0df7adabd7a6e2d6, 0x772fdd63e7936baf
+ .xword 0xf8474c3bb7cdf024, 0x829f3cf387f8795d
+ .xword 0x66e7a46c27f3aa2c, 0x1c3fd4a417c62355
+ .xword 0x935745fc4798b8de, 0xe98f353477ad31a7
+ .xword 0xa6df411fbfb21ca3, 0xdc0731d78f8795da
+ .xword 0x536fa08fdfd90e51, 0x29b7d047efec8728
diff --git a/src/isa-l/crc/aarch64/crc64_norm_common_pmull.h b/src/isa-l/crc/aarch64/crc64_norm_common_pmull.h
new file mode 100644
index 000000000..1bdfc26b5
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_norm_common_pmull.h
@@ -0,0 +1,129 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc_common_pmull.h"
+
+.macro crc64_norm_func name:req
+ .arch armv8-a+crypto
+ .text
+ .align 3
+ .global \name
+ .type \name, %function
+
+/* uint64_t crc64_norm_func(uint64_t seed, const uint8_t * buf, uint64_t len) */
+
+\name\():
+ mvn x_seed, x_seed
+ mov x_counter, 0
+ cmp x_len, (FOLD_SIZE-1)
+ bhi .crc_clmul_pre
+
+.crc_tab_pre:
+ cmp x_len, x_counter
+ bls .done
+
+ adrp x_tmp, .lanchor_crc_tab
+ add x_buf_iter, x_buf, x_counter
+ add x_buf, x_buf, x_len
+ add x_crc_tab_addr, x_tmp, :lo12:.lanchor_crc_tab
+
+ .align 3
+.loop_crc_tab:
+ ldrb w_tmp, [x_buf_iter], 1
+ cmp x_buf, x_buf_iter
+ eor x_tmp, x_tmp, x_seed, lsr 56
+ ldr x_tmp, [x_crc_tab_addr, x_tmp, lsl 3]
+ eor x_seed, x_tmp, x_seed, lsl 8
+ bne .loop_crc_tab
+
+.done:
+ mvn x_crc_ret, x_seed
+ ret
+
+ .align 2
+.crc_clmul_pre:
+ movi v_x0.2s, 0
+ fmov v_x0.d[1], x_seed // save crc to v_x0
+
+ crc_norm_load_first_block
+
+ bls .clmul_loop_end
+
+ crc64_load_p4
+
+// 1024bit --> 512bit loop
+// merge x0, x1, x2, x3, y0, y1, y2, y3 => x0, x1, x2, x3 (uint64x2_t)
+ crc_norm_loop
+
+.clmul_loop_end:
+// folding 512bit --> 128bit
+ crc64_fold_512b_to_128b
+
+// folding 128bit --> 64bit
+ mov x_tmp, p0_low_b0
+ movk x_tmp, p0_low_b1, lsl 16
+ movk x_tmp, p0_low_b2, lsl 32
+ movk x_tmp, p0_low_b3, lsl 48
+ fmov d_p0_high, x_tmp
+
+ pmull2 v_tmp_high.1q, v_x3.2d, v_p0.2d
+ movi v_tmp_low.2s, 0
+ ext v_tmp_low.16b, v_tmp_low.16b, v_x3.16b, #8
+
+ eor v_x3.16b, v_tmp_high.16b, v_tmp_low.16b
+
+// barrett reduction
+ mov x_tmp, br_low_b0
+ movk x_tmp, br_low_b1, lsl 16
+ movk x_tmp, br_low_b2, lsl 32
+ movk x_tmp, br_low_b3, lsl 48
+ fmov d_br_low2, x_tmp
+
+ mov x_tmp2, br_high_b0
+ movk x_tmp2, br_high_b1, lsl 16
+ movk x_tmp2, br_high_b2, lsl 32
+ movk x_tmp2, br_high_b3, lsl 48
+ fmov d_br_high2, x_tmp2
+
+ pmull2 v_tmp_low.1q, v_x3.2d, v_br_low.2d
+ eor v_tmp_low.16b, v_x3.16b, v_tmp_low.16b
+ pmull2 v_tmp_low.1q, v_tmp_low.2d, v_br_high.2d
+ eor v_x3.8b, v_x3.8b, v_tmp_low.8b
+ umov x_seed, v_x3.d[0]
+
+ b .crc_tab_pre
+
+ .size \name, .-\name
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+.shuffle_data:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8
+ .byte 7, 6, 5, 4, 3, 2, 1, 0
+.endm
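For buffers shorter than FOLD_SIZE (64 bytes), and for whatever tail remains after folding, the macro above drops into the .loop_crc_tab byte-at-a-time path over crc64_tab. That fallback is the usual table-driven update for a non-reflected (normal) CRC-64; a rough C sketch of the same loop, with a made-up helper name and assuming a 256-entry table like the ones defined in the polynomial headers:

    #include <stdint.h>
    #include <stddef.h>

    /* Byte-at-a-time update matching the .loop_crc_tab path of the
     * normal-form macro: index with the top byte, shift the CRC left. */
    static uint64_t crc64_norm_tab(uint64_t seed, const uint8_t *buf, size_t len,
                                   const uint64_t tab[256])
    {
            uint64_t crc = ~seed;                        /* mvn x_seed, x_seed    */

            for (size_t i = 0; i < len; i++)             /* .loop_crc_tab         */
                    crc = tab[(uint8_t)((crc >> 56) ^ buf[i])] ^ (crc << 8);

            return ~crc;                                 /* mvn x_crc_ret, x_seed */
    }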
diff --git a/src/isa-l/crc/aarch64/crc64_refl_common_pmull.h b/src/isa-l/crc/aarch64/crc64_refl_common_pmull.h
new file mode 100644
index 000000000..a45501300
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc64_refl_common_pmull.h
@@ -0,0 +1,126 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "crc_common_pmull.h"
+
+.macro crc64_refl_func name:req
+ .arch armv8-a+crypto
+ .text
+ .align 3
+ .global \name
+ .type \name, %function
+
+/* uint64_t crc64_refl_func(uint64_t seed, const uint8_t * buf, uint64_t len) */
+
+\name\():
+ mvn x_seed, x_seed
+ mov x_counter, 0
+ cmp x_len, (FOLD_SIZE-1)
+ bhi .crc_clmul_pre
+
+.crc_tab_pre:
+ cmp x_len, x_counter
+ bls .done
+
+ adrp x_tmp, .lanchor_crc_tab
+ add x_buf_iter, x_buf, x_counter
+ add x_buf, x_buf, x_len
+ add x_crc_tab_addr, x_tmp, :lo12:.lanchor_crc_tab
+
+ .align 3
+.loop_crc_tab:
+ ldrb w_tmp, [x_buf_iter], 1
+ eor w_tmp, w_tmp, w0
+ cmp x_buf, x_buf_iter
+ and x_tmp, x_tmp, 255
+ ldr x_tmp, [x_crc_tab_addr, x_tmp, lsl 3]
+ eor x_seed, x_tmp, x_seed, lsr 8
+ bne .loop_crc_tab
+
+.done:
+ mvn x_crc_ret, x_seed
+ ret
+
+ .align 2
+.crc_clmul_pre:
+ fmov d_x0, x_seed // save crc to d_x0
+
+ crc_refl_load_first_block
+
+ bls .clmul_loop_end
+
+ crc64_load_p4
+
+// 1024bit --> 512bit loop
+// merge x0, x1, x2, x3, y0, y1, y2, y3 => x0, x1, x2, x3 (uint64x2_t)
+ crc_refl_loop
+
+.clmul_loop_end:
+// folding 512bit --> 128bit
+ crc64_fold_512b_to_128b
+
+// folding 128bit --> 64bit
+ mov x_tmp, p0_low_b0
+ movk x_tmp, p0_low_b1, lsl 16
+ movk x_tmp, p0_low_b2, lsl 32
+ movk x_tmp, p0_low_b3, lsl 48
+ fmov d_p0_low, x_tmp
+
+ pmull v_tmp_low.1q, v_x3.1d, v_p0.1d
+
+ mov d_tmp_high, v_x3.d[1]
+
+ eor v_x3.16b, v_tmp_high.16b, v_tmp_low.16b
+
+// barrett reduction
+ mov x_tmp, br_low_b0
+ movk x_tmp, br_low_b1, lsl 16
+ movk x_tmp, br_low_b2, lsl 32
+ movk x_tmp, br_low_b3, lsl 48
+ fmov d_br_low, x_tmp
+
+ mov x_tmp2, br_high_b0
+ movk x_tmp2, br_high_b1, lsl 16
+ movk x_tmp2, br_high_b2, lsl 32
+ movk x_tmp2, br_high_b3, lsl 48
+ fmov d_br_high, x_tmp2
+
+ pmull v_tmp_low.1q, v_x3.1d, v_br_low.1d
+ pmull v_tmp_high.1q, v_tmp_low.1d, v_br_high.1d
+
+ ext v_tmp_low.16b, v_br_low.16b, v_tmp_low.16b, #8
+
+ eor v_tmp_low.16b, v_tmp_low.16b, v_tmp_high.16b
+ eor v_tmp_low.16b, v_tmp_low.16b, v_x3.16b
+ umov x_crc_ret, v_tmp_low.d[1]
+
+ b .crc_tab_pre
+
+ .size \name, .-\name
+.endm
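The reflected-form macro mirrors the same structure; its .loop_crc_tab fallback indexes the table with the low byte of the running CRC and shifts right instead of left. A rough C sketch (helper name made up for illustration):

    #include <stdint.h>
    #include <stddef.h>

    /* Byte-at-a-time fallback equivalent to .loop_crc_tab in the reflected macro. */
    static uint64_t crc64_refl_tab(uint64_t seed, const uint8_t *buf, size_t len,
                                   const uint64_t tab[256])
    {
            uint64_t crc = ~seed;

            for (size_t i = 0; i < len; i++)
                    crc = tab[(uint8_t)(crc ^ buf[i])] ^ (crc >> 8);

            return ~crc;
    }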
diff --git a/src/isa-l/crc/aarch64/crc_aarch64_dispatcher.c b/src/isa-l/crc/aarch64/crc_aarch64_dispatcher.c
new file mode 100644
index 000000000..22ea72b14
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc_aarch64_dispatcher.c
@@ -0,0 +1,166 @@
+/**********************************************************************
+ Copyright(c) 2019-2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(crc16_t10dif)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc16_t10dif_pmull);
+
+ return PROVIDER_BASIC(crc16_t10dif);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc16_t10dif_copy)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc16_t10dif_copy_pmull);
+
+ return PROVIDER_BASIC(crc16_t10dif_copy);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc32_ieee)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL) {
+ return PROVIDER_INFO(crc32_ieee_norm_pmull);
+ }
+
+ return PROVIDER_BASIC(crc32_ieee);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc32_iscsi)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_CRC32) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(crc32_iscsi_crc_ext);
+ }
+ }
+ if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) {
+ return PROVIDER_INFO(crc32_iscsi_3crc_fold);
+ }
+
+ if (auxval & HWCAP_PMULL) {
+ return PROVIDER_INFO(crc32_iscsi_refl_pmull);
+ }
+ return PROVIDER_BASIC(crc32_iscsi);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc32_gzip_refl)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+
+ if (auxval & HWCAP_CRC32) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(crc32_gzip_refl_crc_ext);
+ }
+ }
+ if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) {
+ return PROVIDER_INFO(crc32_gzip_refl_3crc_fold);
+ }
+
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc32_gzip_refl_pmull);
+
+ return PROVIDER_BASIC(crc32_gzip_refl);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc64_ecma_refl)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc64_ecma_refl_pmull);
+
+ return PROVIDER_BASIC(crc64_ecma_refl);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc64_ecma_norm)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc64_ecma_norm_pmull);
+
+ return PROVIDER_BASIC(crc64_ecma_norm);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc64_iso_refl)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc64_iso_refl_pmull);
+
+ return PROVIDER_BASIC(crc64_iso_refl);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc64_iso_norm)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc64_iso_norm_pmull);
+
+ return PROVIDER_BASIC(crc64_iso_norm);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc64_jones_refl)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc64_jones_refl_pmull);
+
+ return PROVIDER_BASIC(crc64_jones_refl);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(crc64_jones_norm)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_PMULL)
+ return PROVIDER_INFO(crc64_jones_norm_pmull);
+
+ return PROVIDER_BASIC(crc64_jones_norm);
+
+}
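Each dispatcher above reads the HWCAP bits once via getauxval() and returns the best provider the CPU supports: the CRC extension on known cores, the combined CRC+PMULL path when both features are present, plain PMULL folding otherwise, and the generic C code as a last resort. The same selection pattern, reduced to a standalone sketch (getauxval, AT_HWCAP and HWCAP_PMULL are real Linux/aarch64 interfaces; the _base fallback name is assumed here):

    #include <stdint.h>
    #include <sys/auxv.h>   /* getauxval, AT_HWCAP */
    #include <asm/hwcap.h>  /* HWCAP_PMULL */

    typedef uint64_t (*crc64_fn)(uint64_t seed, const uint8_t *buf, uint64_t len);

    uint64_t crc64_ecma_norm_pmull(uint64_t seed, const uint8_t *buf, uint64_t len);
    uint64_t crc64_ecma_norm_base(uint64_t seed, const uint8_t *buf, uint64_t len);

    /* Pick the PMULL implementation only when the kernel reports the
     * feature; otherwise fall back to the portable table-driven code. */
    static crc64_fn select_crc64_ecma_norm(void)
    {
            unsigned long hwcap = getauxval(AT_HWCAP);

            if (hwcap & HWCAP_PMULL)
                    return crc64_ecma_norm_pmull;
            return crc64_ecma_norm_base;
    }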
diff --git a/src/isa-l/crc/aarch64/crc_common_pmull.h b/src/isa-l/crc/aarch64/crc_common_pmull.h
new file mode 100644
index 000000000..20a71b913
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc_common_pmull.h
@@ -0,0 +1,302 @@
+########################################################################
+# Copyright (c) 2019 Microsoft Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Microsoft Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+// parameters
+#define w_seed w0
+#define x_seed x0
+#define x_buf x1
+#define w_len w2
+#define x_len x2
+
+// return
+#define w_crc_ret w0
+#define x_crc_ret x0
+
+// constant
+#define FOLD_SIZE 64
+
+// global variables
+#define x_buf_end x3
+#define w_counter w4
+#define x_counter x4
+#define x_buf_iter x5
+#define x_crc_tab_addr x6
+#define x_tmp2 x6
+#define w_tmp w7
+#define x_tmp x7
+
+#define v_x0 v0
+#define d_x0 d0
+#define s_x0 s0
+
+#define q_x1 q1
+#define v_x1 v1
+
+#define q_x2 q2
+#define v_x2 v2
+
+#define q_x3 q3
+#define v_x3 v3
+#define d_x3 d3
+#define s_x3 s3
+
+#define q_y0 q4
+#define v_y0 v4
+#define v_tmp_high v4
+#define d_tmp_high d4
+
+#define q_y1 q5
+#define v_y1 v5
+#define v_tmp_low v5
+
+#define q_y2 q6
+#define v_y2 v6
+
+#define q_y3 q7
+#define v_y3 v7
+
+#define q_x0_tmp q30
+#define v_x0_tmp v30
+#define d_p4_high v30.d[1]
+#define d_p4_low d30
+#define v_p4 v30
+#define d_p1_high v30.d[1]
+#define d_p1_low d30
+#define v_p1 v30
+#define d_p0_high v30.d[1]
+#define d_p0_low d30
+#define v_p0 v30
+#define d_br_low d30
+#define d_br_low2 v30.d[1]
+#define v_br_low v30
+
+#define q_shuffle q31
+#define v_shuffle v31
+#define d_br_high d31
+#define d_br_high2 v31.d[1]
+#define v_br_high v31
+#define d_p0_low2 d31
+#define d_p0_high2 v31.d[1]
+#define v_p02 v31
+
+#define v_x0_high v16
+#define v_x1_high v17
+#define v_x2_high v18
+#define v_x3_high v19
+
+.macro crc_refl_load_first_block
+ ldr q_x0_tmp, [x_buf]
+ ldr q_x1, [x_buf, 16]
+ ldr q_x2, [x_buf, 32]
+ ldr q_x3, [x_buf, 48]
+
+ and x_counter, x_len, -64
+ sub x_tmp, x_counter, #64
+ cmp x_tmp, 63
+
+ add x_buf_iter, x_buf, 64
+
+ eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
+.endm
+
+.macro crc_norm_load_first_block
+ adrp x_tmp, .shuffle_data
+ ldr q_shuffle, [x_tmp, #:lo12:.shuffle_data]
+
+ ldr q_x0_tmp, [x_buf]
+ ldr q_x1, [x_buf, 16]
+ ldr q_x2, [x_buf, 32]
+ ldr q_x3, [x_buf, 48]
+
+ and x_counter, x_len, -64
+ sub x_tmp, x_counter, #64
+ cmp x_tmp, 63
+
+ add x_buf_iter, x_buf, 64
+
+ tbl v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
+ tbl v_x1.16b, {v_x1.16b}, v_shuffle.16b
+ tbl v_x2.16b, {v_x2.16b}, v_shuffle.16b
+ tbl v_x3.16b, {v_x3.16b}, v_shuffle.16b
+
+ eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
+.endm
+
+.macro crc32_load_p4
+ add x_buf_end, x_buf_iter, x_tmp
+
+ mov x_tmp, p4_low_b0
+ movk x_tmp, p4_low_b1, lsl 16
+ fmov d_p4_low, x_tmp
+
+ mov x_tmp2, p4_high_b0
+ movk x_tmp2, p4_high_b1, lsl 16
+ fmov d_p4_high, x_tmp2
+.endm
+
+.macro crc64_load_p4
+ add x_buf_end, x_buf_iter, x_tmp
+
+ mov x_tmp, p4_low_b0
+ movk x_tmp, p4_low_b1, lsl 16
+ movk x_tmp, p4_low_b2, lsl 32
+ movk x_tmp, p4_low_b3, lsl 48
+ fmov d_p4_low, x_tmp
+
+ mov x_tmp2, p4_high_b0
+ movk x_tmp2, p4_high_b1, lsl 16
+ movk x_tmp2, p4_high_b2, lsl 32
+ movk x_tmp2, p4_high_b3, lsl 48
+ fmov d_p4_high, x_tmp2
+.endm
+
+.macro crc_refl_loop
+ .align 3
+.clmul_loop:
+	// interleave ldr and pmull(2) for architectures that can only issue one quadword
+	// load every other cycle (e.g. Cortex-A55)
+ ldr q_y0, [x_buf_iter]
+ pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
+ ldr q_y1, [x_buf_iter, 16]
+ pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
+ ldr q_y2, [x_buf_iter, 32]
+ pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
+ ldr q_y3, [x_buf_iter, 48]
+ pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d
+
+ pmull v_x0.1q, v_x0.1d, v_p4.1d
+ add x_buf_iter, x_buf_iter, 64
+ pmull v_x1.1q, v_x1.1d, v_p4.1d
+ cmp x_buf_iter, x_buf_end
+ pmull v_x2.1q, v_x2.1d, v_p4.1d
+ pmull v_x3.1q, v_x3.1d, v_p4.1d
+
+ eor v_x0.16b, v_x0.16b, v_x0_high.16b
+ eor v_x1.16b, v_x1.16b, v_x1_high.16b
+ eor v_x2.16b, v_x2.16b, v_x2_high.16b
+ eor v_x3.16b, v_x3.16b, v_x3_high.16b
+
+ eor v_x0.16b, v_x0.16b, v_y0.16b
+ eor v_x1.16b, v_x1.16b, v_y1.16b
+ eor v_x2.16b, v_x2.16b, v_y2.16b
+ eor v_x3.16b, v_x3.16b, v_y3.16b
+ bne .clmul_loop
+.endm
+
+.macro crc_norm_loop
+ .align 3
+.clmul_loop:
+	// interleave ldr and pmull(2) for architectures that can only issue one quadword
+	// load every other cycle (e.g. Cortex-A55)
+ ldr q_y0, [x_buf_iter]
+ pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
+ ldr q_y1, [x_buf_iter, 16]
+ pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
+ ldr q_y2, [x_buf_iter, 32]
+ pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
+ ldr q_y3, [x_buf_iter, 48]
+ pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d
+
+ pmull v_x0.1q, v_x0.1d, v_p4.1d
+ add x_buf_iter, x_buf_iter, 64
+ pmull v_x1.1q, v_x1.1d, v_p4.1d
+ cmp x_buf_iter, x_buf_end
+ pmull v_x2.1q, v_x2.1d, v_p4.1d
+ pmull v_x3.1q, v_x3.1d, v_p4.1d
+
+ tbl v_y0.16b, {v_y0.16b}, v_shuffle.16b
+ tbl v_y1.16b, {v_y1.16b}, v_shuffle.16b
+ tbl v_y2.16b, {v_y2.16b}, v_shuffle.16b
+ tbl v_y3.16b, {v_y3.16b}, v_shuffle.16b
+
+ eor v_x0.16b, v_x0.16b, v_x0_high.16b
+ eor v_x1.16b, v_x1.16b, v_x1_high.16b
+ eor v_x2.16b, v_x2.16b, v_x2_high.16b
+ eor v_x3.16b, v_x3.16b, v_x3_high.16b
+
+ eor v_x0.16b, v_x0.16b, v_y0.16b
+ eor v_x1.16b, v_x1.16b, v_y1.16b
+ eor v_x2.16b, v_x2.16b, v_y2.16b
+ eor v_x3.16b, v_x3.16b, v_y3.16b
+ bne .clmul_loop
+.endm
+
+.macro crc32_fold_512b_to_128b
+ mov x_tmp, p1_low_b0
+ movk x_tmp, p1_low_b1, lsl 16
+ fmov d_p1_low, x_tmp
+
+ mov x_tmp2, p1_high_b0
+ movk x_tmp2, p1_high_b1, lsl 16
+ fmov d_p1_high, x_tmp2
+
+ pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d
+ pmull v_tmp_low.1q, v_x0.1d, v_p1.1d
+ eor v_x1.16b, v_x1.16b, v_tmp_high.16b
+ eor v_x1.16b, v_x1.16b, v_tmp_low.16b
+
+ pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d
+ pmull v_tmp_low.1q, v_x1.1d, v_p1.1d
+ eor v_x2.16b, v_x2.16b, v_tmp_high.16b
+ eor v_x2.16b, v_x2.16b, v_tmp_low.16b
+
+ pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d
+ pmull v_tmp_low.1q, v_x2.1d, v_p1.1d
+ eor v_x3.16b, v_x3.16b, v_tmp_high.16b
+ eor v_x3.16b, v_x3.16b, v_tmp_low.16b
+.endm
+
+.macro crc64_fold_512b_to_128b
+ mov x_tmp, p1_low_b0
+ movk x_tmp, p1_low_b1, lsl 16
+ movk x_tmp, p1_low_b2, lsl 32
+ movk x_tmp, p1_low_b3, lsl 48
+ fmov d_p1_low, x_tmp
+
+ mov x_tmp2, p1_high_b0
+ movk x_tmp2, p1_high_b1, lsl 16
+ movk x_tmp2, p1_high_b2, lsl 32
+ movk x_tmp2, p1_high_b3, lsl 48
+ fmov d_p1_high, x_tmp2
+
+ pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d
+ pmull v_tmp_low.1q, v_x0.1d, v_p1.1d
+ eor v_x1.16b, v_x1.16b, v_tmp_high.16b
+ eor v_x1.16b, v_x1.16b, v_tmp_low.16b
+
+ pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d
+ pmull v_tmp_low.1q, v_x1.1d, v_p1.1d
+ eor v_x2.16b, v_x2.16b, v_tmp_high.16b
+ eor v_x2.16b, v_x2.16b, v_tmp_low.16b
+
+ pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d
+ pmull v_tmp_low.1q, v_x2.1d, v_p1.1d
+ eor v_x3.16b, v_x3.16b, v_tmp_high.16b
+ eor v_x3.16b, v_x3.16b, v_tmp_low.16b
+.endm
\ No newline at end of file
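The crc_refl_loop/crc_norm_loop macros above fold 64 bytes per iteration: each 128-bit lane x_i is carry-less multiplied against the fold constant held in v_p4 (low half with pmull, high half with pmull2), and the two products are XORed with the freshly loaded block y_i. One such fold step can be written with ACLE intrinsics as a sketch (requires the crypto extension, e.g. -march=armv8-a+crypto; illustrative only, the assembly keeps four lanes in flight):

    #include <arm_neon.h>

    /* One fold step: x' = (x.lo * k.lo) ^ (x.hi * k.hi) ^ y, mirroring the
     * pmull/pmull2/eor triple applied to each lane in the clmul loop. */
    static inline uint8x16_t crc_fold_step(uint8x16_t x, uint8x16_t y, poly64x2_t k)
    {
            poly64x2_t xp = vreinterpretq_p64_u8(x);

            uint8x16_t lo = vreinterpretq_u8_p128(
                    vmull_p64(vgetq_lane_p64(xp, 0), vgetq_lane_p64(k, 0)));
            uint8x16_t hi = vreinterpretq_u8_p128(vmull_high_p64(xp, k));

            return veorq_u8(veorq_u8(lo, hi), y);
    }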
diff --git a/src/isa-l/crc/aarch64/crc_multibinary_arm.S b/src/isa-l/crc/aarch64/crc_multibinary_arm.S
new file mode 100644
index 000000000..76f957164
--- /dev/null
+++ b/src/isa-l/crc/aarch64/crc_multibinary_arm.S
@@ -0,0 +1,42 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+#include <aarch64_multibinary.h>
+
+
+mbin_interface crc32_iscsi
+mbin_interface crc16_t10dif
+mbin_interface crc16_t10dif_copy
+mbin_interface crc32_ieee
+mbin_interface crc32_gzip_refl
+mbin_interface crc64_ecma_refl
+mbin_interface crc64_ecma_norm
+mbin_interface crc64_iso_refl
+mbin_interface crc64_iso_norm
+mbin_interface crc64_jones_refl
+mbin_interface crc64_jones_norm
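Each mbin_interface line above exports one of the public CRC symbols and wires it to the matching DEFINE_INTERFACE_DISPATCHER from the previous file, so the implementation is chosen at runtime. The effect is essentially lazy function-pointer binding; a conceptual C sketch (not the actual macro expansion, resolver name assumed):

    #include <stdint.h>

    typedef uint64_t (*crc64_fn)(uint64_t seed, const uint8_t *buf, uint64_t len);

    crc64_fn crc64_jones_norm_dispatcher(void);      /* assumed resolver name */

    static crc64_fn crc64_jones_norm_impl;

    /* First call resolves the best implementation and caches it; later
     * calls go straight through the cached pointer. */
    uint64_t crc64_jones_norm(uint64_t seed, const uint8_t *buf, uint64_t len)
    {
            if (!crc64_jones_norm_impl)
                    crc64_jones_norm_impl = crc64_jones_norm_dispatcher();
            return crc64_jones_norm_impl(seed, buf, len);
    }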
diff --git a/src/isa-l/crc/crc16_t10dif_01.asm b/src/isa-l/crc/crc16_t10dif_01.asm
new file mode 100644
index 000000000..536b6f38d
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_01.asm
@@ -0,0 +1,666 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; UINT16 crc16_t10dif_01(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global crc16_t10dif_01, function
+crc16_t10dif_01:
+ endbranch
+
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+	; After this point, the code flow is exactly the same as for a 32-bit CRC.
+	; The only difference is that before returning eax, we shift it right by 16 bits to scale back to 16 bits.
+
+ sub rsp, VARIABLE_OFFSET
+%ifidn __OUTPUT_FORMAT__, win64
+	; save xmm6-xmm13 on the stack (callee-saved registers under the win64 ABI)
+ movdqa [rsp+16*2],xmm6
+ movdqa [rsp+16*3],xmm7
+ movdqa [rsp+16*4],xmm8
+ movdqa [rsp+16*5],xmm9
+ movdqa [rsp+16*6],xmm10
+ movdqa [rsp+16*7],xmm11
+ movdqa [rsp+16*8],xmm12
+ movdqa [rsp+16*9],xmm13
+%endif
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movd xmm10, arg1_low32 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then align with the initial crc in the correct place.
+ pslldq xmm10, 12
+
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point in the code there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+	; loop will fold 128B at a time until 128+y bytes of buffer remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
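+	; in effect each iteration below computes
+	;     xmm7 = (xmm7.lo64 clmul rk1) xor (xmm7.hi64 clmul rk2) xor next_16_bytes
+	; i.e. the running 128-bit remainder is folded forward by 16 bytes at a time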
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
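+	; (sketch of the trick used below: the entry at pshufb_shf_table+16-arg3 is a
+	;  byte-index mask used with pshufb as a byte shift; xoring it with mask1
+	;  (0x80 in every byte) produces the complementary shift in the other
+	;  direction, since pshufb zeroes bytes whose index has the top bit set, and
+	;  pblendvb then merges the shifted folded data with the freshly loaded tail
+	;  so that exactly 16 bytes remain to be folded in the final step)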
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x1
+ pslldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm7
+
+ pand xmm0, [mask2]
+
+ psrldq xmm7, 12
+ pclmulqdq xmm7, xmm10, 0x10
+ pxor xmm7, xmm0
+
+ ;barrett reduction
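+	; (sketch: Barrett reduction replaces the final division by Q with two
+	;  carry-less multiplies using the precomputed constants rk7 = floor(2^64/Q)
+	;  (a GF(2) polynomial division) and rk8 = Q; the 32-bit remainder ends up
+	;  in dword 1 of xmm7, and for this T10-DIF variant the CRC16 occupies its
+	;  upper 16 bits, hence the shr eax, 16 in _cleanup)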
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7
+ pclmulqdq xmm7, xmm10, 0x01
+ pslldq xmm7, 4
+ pclmulqdq xmm7, xmm10, 0x11
+
+ pslldq xmm7, 4
+ pxor xmm7, xmm0
+ pextrd eax, xmm7,1
+
+_cleanup:
+ ; scale the result back to 16 bits
+ shr eax, 16
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp+16*2]
+ movdqa xmm7, [rsp+16*3]
+ movdqa xmm8, [rsp+16*4]
+ movdqa xmm9, [rsp+16*5]
+ movdqa xmm10, [rsp+16*6]
+ movdqa xmm11, [rsp+16*7]
+ movdqa xmm12, [rsp+16*8]
+ movdqa xmm13, [rsp+16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
+align 16
+; Q = 0x18BB70000
+; rk1 = 2^(32*3) mod Q << 32
+; rk2 = 2^(32*5) mod Q << 32
+; rk3 = 2^(32*31) mod Q << 32
+; rk4 = 2^(32*33) mod Q << 32
+; rk5 = 2^(32*3) mod Q << 32
+; rk6 = 2^(32*2) mod Q << 32
+; rk7 = floor(2^64/Q)
+; rk8 = Q
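+; note: the powers, mod and floor above are carry-less (GF(2)[x]) polynomial
+; operations on the bit patterns, not ordinary integer arithmetic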
+rk1:
+DQ 0x2d56000000000000
+rk2:
+DQ 0x06df000000000000
+rk3:
+DQ 0x9d9d000000000000
+rk4:
+DQ 0x7cf5000000000000
+rk5:
+DQ 0x2d56000000000000
+rk6:
+DQ 0x1368000000000000
+rk7:
+DQ 0x00000001f65a57f8
+rk8:
+DQ 0x000000018bb70000
+
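+; rk9..rk20 (in pairs) are the fold constants used above to collapse the eight
+; 128-byte-loop accumulators into one: rk9:rk10 fold xmm0 forward by 112 bytes,
+; rk11:rk12 fold xmm1 by 96 bytes, ..., rk19:rk20 fold xmm5 by 32 bytes, and the
+; final 16-byte step for xmm6 reuses rk1:rk2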
+rk9:
+DQ 0xceae000000000000
+rk10:
+DQ 0xbfd6000000000000
+rk11:
+DQ 0x1e16000000000000
+rk12:
+DQ 0x713c000000000000
+rk13:
+DQ 0xf7f9000000000000
+rk14:
+DQ 0x80a6000000000000
+rk15:
+DQ 0x044c000000000000
+rk16:
+DQ 0xe658000000000000
+rk17:
+DQ 0xad18000000000000
+rk18:
+DQ 0xa497000000000000
+rk19:
+DQ 0x6ee3000000000000
+rk20:
+DQ 0xe7b5000000000000
+
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+;;; func core, ver, snum
+slversion crc16_t10dif_01, 01, 06, 0010
+
diff --git a/src/isa-l/crc/crc16_t10dif_02.asm b/src/isa-l/crc/crc16_t10dif_02.asm
new file mode 100644
index 000000000..0e392afb1
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_02.asm
@@ -0,0 +1,654 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; UINT16 crc16_t10dif_02(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global crc16_t10dif_02, function
+crc16_t10dif_02:
+ endbranch
+
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+ ; After this point, code flow is exactly same as a 32-bit CRC.
+ ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
+
+ sub rsp, VARIABLE_OFFSET
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the xmm registers on the stack to preserve them (callee-saved in the win64 ABI)
+ vmovdqa [rsp+16*2],xmm6
+ vmovdqa [rsp+16*3],xmm7
+ vmovdqa [rsp+16*4],xmm8
+ vmovdqa [rsp+16*5],xmm9
+ vmovdqa [rsp+16*6],xmm10
+ vmovdqa [rsp+16*7],xmm11
+ vmovdqa [rsp+16*8],xmm12
+ vmovdqa [rsp+16*9],xmm13
+%endif
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ vmovd xmm10, arg1_low32 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial crc at correct place.
+ vpslldq xmm10, 12
+
+ vmovdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ vmovdqu xmm0, [arg2+16*0]
+ vmovdqu xmm1, [arg2+16*1]
+ vmovdqu xmm2, [arg2+16*2]
+ vmovdqu xmm3, [arg2+16*3]
+ vmovdqu xmm4, [arg2+16*4]
+ vmovdqu xmm5, [arg2+16*5]
+ vmovdqu xmm6, [arg2+16*6]
+ vmovdqu xmm7, [arg2+16*7]
+
+ vpshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ vpxor xmm0, xmm10
+ vpshufb xmm1, xmm11
+ vpshufb xmm2, xmm11
+ vpshufb xmm3, xmm11
+ vpshufb xmm4, xmm11
+ vpshufb xmm5, xmm11
+ vpshufb xmm6, xmm11
+ vpshufb xmm7, xmm11
+
+ vmovdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ vmovdqu xmm9, [arg2+16*0]
+ vmovdqu xmm12, [arg2+16*1]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm13, xmm1
+ vpclmulqdq xmm0, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm1, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm0, xmm9
+ vxorps xmm0, xmm8
+ vpxor xmm1, xmm12
+ vxorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ vmovdqu xmm9, [arg2+16*2]
+ vmovdqu xmm12, [arg2+16*3]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm13, xmm3
+ vpclmulqdq xmm2, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm3, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm2, xmm9
+ vxorps xmm2, xmm8
+ vpxor xmm3, xmm12
+ vxorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ vmovdqu xmm9, [arg2+16*4]
+ vmovdqu xmm12, [arg2+16*5]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm4
+ vmovdqa xmm13, xmm5
+ vpclmulqdq xmm4, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm5, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm4, xmm9
+ vxorps xmm4, xmm8
+ vpxor xmm5, xmm12
+ vxorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ vmovdqu xmm9, [arg2+16*6]
+ vmovdqu xmm12, [arg2+16*7]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm6
+ vmovdqa xmm13, xmm7
+ vpclmulqdq xmm6, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm7, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm6, xmm9
+ vxorps xmm6, xmm8
+ vpxor xmm7, xmm12
+ vxorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ vmovdqa xmm10, [rk9]
+ vmovdqa xmm8, xmm0
+ vpclmulqdq xmm0, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm0
+
+ vmovdqa xmm10, [rk11]
+ vmovdqa xmm8, xmm1
+ vpclmulqdq xmm1, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm1
+
+ vmovdqa xmm10, [rk13]
+ vmovdqa xmm8, xmm2
+ vpclmulqdq xmm2, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+ vmovdqa xmm10, [rk15]
+ vmovdqa xmm8, xmm3
+ vpclmulqdq xmm3, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm3
+
+ vmovdqa xmm10, [rk17]
+ vmovdqa xmm8, xmm4
+ vpclmulqdq xmm4, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm4
+
+ vmovdqa xmm10, [rk19]
+ vmovdqa xmm8, xmm5
+ vpclmulqdq xmm5, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm5
+
+ vmovdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ vmovdqa xmm8, xmm6
+ vpclmulqdq xmm6, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpshufb xmm0, xmm11
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ vmovdqa xmm2, xmm7
+
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+ vpshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ vmovdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ vpshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ vpxor xmm0, [mask1]
+ vpshufb xmm7, xmm0
+ vpblendvb xmm1, xmm1, xmm2, xmm0
+
+ ; fold 16 Bytes
+ vmovdqa xmm2, xmm1
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0x1
+ vpslldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm7
+
+ vpand xmm0, [mask2]
+
+ vpsrldq xmm7, 12
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ vmovdqa xmm0, xmm7
+ vpclmulqdq xmm7, xmm10, 0x01
+ vpslldq xmm7, 4
+ vpclmulqdq xmm7, xmm10, 0x11
+
+ vpslldq xmm7, 4
+ vpxor xmm7, xmm0
+ vpextrd eax, xmm7,1
+
+_cleanup:
+ ; scale the result back to 16 bits
+ shr eax, 16
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp+16*2]
+ vmovdqa xmm7, [rsp+16*3]
+ vmovdqa xmm8, [rsp+16*4]
+ vmovdqa xmm9, [rsp+16*5]
+ vmovdqa xmm10, [rsp+16*6]
+ vmovdqa xmm11, [rsp+16*7]
+ vmovdqa xmm12, [rsp+16*8]
+ vmovdqa xmm13, [rsp+16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ vmovdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm11 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ vmovdqa xmm11, [SHUF_MASK]
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm11 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ vmovdqu xmm0, [rax]
+ vpxor xmm0, [mask1]
+
+ vpshufb xmm7, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
+align 16
+; Q = 0x18BB70000
+; rk1 = 2^(32*3) mod Q << 32
+; rk2 = 2^(32*5) mod Q << 32
+; rk3 = 2^(32*31) mod Q << 32
+; rk4 = 2^(32*33) mod Q << 32
+; rk5 = 2^(32*3) mod Q << 32
+; rk6 = 2^(32*2) mod Q << 32
+; rk7 = floor(2^64/Q)
+; rk8 = Q
+rk1:
+DQ 0x2d56000000000000
+rk2:
+DQ 0x06df000000000000
+rk3:
+DQ 0x9d9d000000000000
+rk4:
+DQ 0x7cf5000000000000
+rk5:
+DQ 0x2d56000000000000
+rk6:
+DQ 0x1368000000000000
+rk7:
+DQ 0x00000001f65a57f8
+rk8:
+DQ 0x000000018bb70000
+
+rk9:
+DQ 0xceae000000000000
+rk10:
+DQ 0xbfd6000000000000
+rk11:
+DQ 0x1e16000000000000
+rk12:
+DQ 0x713c000000000000
+rk13:
+DQ 0xf7f9000000000000
+rk14:
+DQ 0x80a6000000000000
+rk15:
+DQ 0x044c000000000000
+rk16:
+DQ 0xe658000000000000
+rk17:
+DQ 0xad18000000000000
+rk18:
+DQ 0xa497000000000000
+rk19:
+DQ 0x6ee3000000000000
+rk20:
+DQ 0xe7b5000000000000
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
diff --git a/src/isa-l/crc/crc16_t10dif_by16_10.asm b/src/isa-l/crc/crc16_t10dif_by16_10.asm
new file mode 100644
index 000000000..27a2e02a0
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_by16_10.asm
@@ -0,0 +1,591 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+;      UINT16 crc16_t10dif_by16_10(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+;
+;
+
+%include "reg_sizes.asm"
+
+%ifndef FUNCTION_NAME
+%define FUNCTION_NAME crc16_t10dif_by16_10
+%endif
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*12+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global FUNCTION_NAME, function
+FUNCTION_NAME:
+ endbranch
+
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+ ; After this point, code flow is exactly same as a 32-bit CRC.
+ ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
+
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the xmm registers on the stack to preserve them (callee-saved in the win64 ABI)
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+%endif
+
+ vbroadcasti32x4 zmm18, [SHUF_MASK]
+ cmp arg3, 256
+ jl .less_than_256
+
+ ; load the initial crc value
+ vmovd xmm10, arg1_low32 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with initial crc at correct place.
+ vpslldq xmm10, 12
+
+ ; receive the initial 64B data, xor the initial crc value
+ vmovdqu8 zmm0, [arg2+16*0]
+ vmovdqu8 zmm4, [arg2+16*4]
+ vpshufb zmm0, zmm0, zmm18
+ vpshufb zmm4, zmm4, zmm18
+ vpxorq zmm0, zmm10
+ vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+
+ sub arg3, 256
+ cmp arg3, 256
+ jl .fold_128_B_loop
+
+ vmovdqu8 zmm7, [arg2+16*8]
+ vmovdqu8 zmm8, [arg2+16*12]
+ vpshufb zmm7, zmm7, zmm18
+ vpshufb zmm8, zmm8, zmm18
+ vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
+ sub arg3, 256
+
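+	; (note: rk_1:rk_2 in zmm16 are the fold constants for a 256-byte stride;
+	;  the four zmm accumulators zmm0, zmm4, zmm7, zmm8 each carry 64 bytes, so
+	;  one pass of .fold_256_B_loop consumes 256 bytes of input, while rk3:rk4
+	;  in zmm10 remain the 128-byte-stride constants used once the loop exits)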
+.fold_256_B_loop:
+ add arg2, 256
+ vmovdqu8 zmm3, [arg2+16*0]
+ vpshufb zmm3, zmm3, zmm18
+ vpclmulqdq zmm1, zmm0, zmm16, 0x00
+ vpclmulqdq zmm2, zmm0, zmm16, 0x11
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm3
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpshufb zmm9, zmm9, zmm18
+ vpclmulqdq zmm5, zmm4, zmm16, 0x00
+ vpclmulqdq zmm6, zmm4, zmm16, 0x11
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ vmovdqu8 zmm11, [arg2+16*8]
+ vpshufb zmm11, zmm11, zmm18
+ vpclmulqdq zmm12, zmm7, zmm16, 0x00
+ vpclmulqdq zmm13, zmm7, zmm16, 0x11
+ vpxorq zmm7, zmm12, zmm13
+ vpxorq zmm7, zmm7, zmm11
+
+ vmovdqu8 zmm17, [arg2+16*12]
+ vpshufb zmm17, zmm17, zmm18
+ vpclmulqdq zmm14, zmm8, zmm16, 0x00
+ vpclmulqdq zmm15, zmm8, zmm16, 0x11
+ vpxorq zmm8, zmm14, zmm15
+ vpxorq zmm8, zmm8, zmm17
+
+ sub arg3, 256
+ jge .fold_256_B_loop
+
+ ;; Fold 256 into 128
+ add arg2, 256
+ vpclmulqdq zmm1, zmm0, zmm10, 0x00
+ vpclmulqdq zmm2, zmm0, zmm10, 0x11
+ vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
+
+ vpclmulqdq zmm5, zmm4, zmm10, 0x00
+ vpclmulqdq zmm6, zmm4, zmm10, 0x11
+ vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
+
+ vmovdqa32 zmm0, zmm7
+ vmovdqa32 zmm4, zmm8
+
+ add arg3, 128
+ jmp .fold_128_B_register
+
+
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+	; fold 128B at a time. This section of the code folds two zmm registers (eight 128-bit lanes) in parallel
+.fold_128_B_loop:
+ add arg2, 128
+ vmovdqu8 zmm8, [arg2+16*0]
+ vpshufb zmm8, zmm8, zmm18
+ vpclmulqdq zmm2, zmm0, zmm10, 0x00
+ vpclmulqdq zmm1, zmm0, zmm10, 0x11
+ vpxorq zmm0, zmm2, zmm1
+ vpxorq zmm0, zmm0, zmm8
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpshufb zmm9, zmm9, zmm18
+ vpclmulqdq zmm5, zmm4, zmm10, 0x00
+ vpclmulqdq zmm6, zmm4, zmm10, 0x11
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ sub arg3, 128
+ jge .fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+	; the 128B of folded data is in the two zmm registers zmm0 and zmm4 (eight 128-bit lanes)
+
+.fold_128_B_register:
+ ; fold the 8 128b parts into 1 xmm register with different constants
+ vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
+ vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
+ vpclmulqdq zmm1, zmm0, zmm16, 0x00
+ vpclmulqdq zmm2, zmm0, zmm16, 0x11
+ vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
+
+ vpclmulqdq zmm5, zmm4, zmm11, 0x00
+ vpclmulqdq zmm6, zmm4, zmm11, 0x11
+ vmovdqa xmm10, [rk1] ; Needed later in reduction loop
+ vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
+ vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
+
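+	; the shuffle/xor sequence below xors the four 128-bit lanes of zmm1
+	; together, leaving the single folded 128-bit remainder in xmm7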
+ vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
+ vpxorq ymm8, ymm8, ymm1
+ vextracti64x2 xmm5, ymm8, 1
+ vpxorq xmm7, xmm5, xmm8
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl .final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+.16B_reduction_loop:
+ vpclmulqdq xmm8, xmm7, xmm10, 0x11
+ vpclmulqdq xmm7, xmm7, xmm10, 0x00
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpshufb xmm0, xmm0, xmm18
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge .16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+.final_reduction_for_128:
+ add arg3, 16
+ je .128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset
+ ; the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+.get_last_two_xmms:
+
+ vmovdqa xmm2, xmm7
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+ vpshufb xmm1, xmm18
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ vmovdqu xmm0, [rax]
+
+ vpshufb xmm2, xmm0
+ vpxor xmm0, [mask1]
+ vpshufb xmm7, xmm0
+ vpblendvb xmm1, xmm1, xmm2, xmm0
+
+ vpclmulqdq xmm8, xmm7, xmm10, 0x11
+ vpclmulqdq xmm7, xmm7, xmm10, 0x00
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm1
+
+.128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5]
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0x01 ; H*L
+ vpslldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm7
+ vpand xmm0, [mask2]
+ vpsrldq xmm7, 12
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm0
+
+ ;barrett reduction
+.barrett:
+ vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ vmovdqa xmm0, xmm7
+ vpclmulqdq xmm7, xmm10, 0x01
+ vpslldq xmm7, 4
+ vpclmulqdq xmm7, xmm10, 0x11
+
+ vpslldq xmm7, 4
+ vpxor xmm7, xmm0
+ vpextrd eax, xmm7, 1
+
+.cleanup:
+ ; scale the result back to 16 bits
+ shr eax, 16
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+.less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl .less_than_32
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm18 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp .16B_reduction_loop
+
+
+align 16
+.less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je .cleanup
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je .exact_16_left
+ jl .less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp .get_last_two_xmms
+
+align 16
+.less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl .only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl .less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+.less_than_8_left:
+
+ cmp arg3, 4
+ jl .less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+.less_than_4_left:
+
+ cmp arg3, 2
+ jl .less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+.less_than_2_left:
+ cmp arg3, 1
+ jl .zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+.zero_left:
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ vmovdqu xmm0, [rax]
+ vpxor xmm0, [mask1]
+
+ vpshufb xmm7,xmm0
+ jmp .128_done
+
+align 16
+.exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ jmp .128_done
+
+.only_less_than_4:
+ cmp arg3, 3
+ jl .only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 5
+ jmp .barrett
+
+.only_less_than_3:
+ cmp arg3, 2
+ jl .only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 6
+ jmp .barrett
+
+.only_less_than_2:
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 7
+ jmp .barrett
+
+section .data
+align 32
+
+%ifndef USE_CONSTS
+; precomputed constants
+
+rk_1: dq 0xdccf000000000000
+rk_2: dq 0x4b0b000000000000
+rk1: dq 0x2d56000000000000
+rk2: dq 0x06df000000000000
+rk3: dq 0x9d9d000000000000
+rk4: dq 0x7cf5000000000000
+rk5: dq 0x2d56000000000000
+rk6: dq 0x1368000000000000
+rk7: dq 0x00000001f65a57f8
+rk8: dq 0x000000018bb70000
+rk9: dq 0xceae000000000000
+rk10: dq 0xbfd6000000000000
+rk11: dq 0x1e16000000000000
+rk12: dq 0x713c000000000000
+rk13: dq 0xf7f9000000000000
+rk14: dq 0x80a6000000000000
+rk15: dq 0x044c000000000000
+rk16: dq 0xe658000000000000
+rk17: dq 0xad18000000000000
+rk18: dq 0xa497000000000000
+rk19: dq 0x6ee3000000000000
+rk20: dq 0xe7b5000000000000
+
+rk_1b: dq 0x2d56000000000000
+rk_2b: dq 0x06df000000000000
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%else
+INCLUDE_CONSTS
+%endif
+
+mask1: dq 0x8080808080808080, 0x8080808080808080
+mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FUNCTION_NAME
+no_ %+ FUNCTION_NAME %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/isa-l/crc/crc16_t10dif_by4.asm b/src/isa-l/crc/crc16_t10dif_by4.asm
new file mode 100644
index 000000000..1326eb2f5
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_by4.asm
@@ -0,0 +1,563 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Function API:
+; UINT16 crc16_t10dif_by4(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+align 16
+mk_global crc16_t10dif_by4, function
+crc16_t10dif_by4:
+ endbranch
+
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+ ; After this point, code flow is exactly same as a 32-bit CRC.
+ ; The only difference is before returning eax, we will shift
+ ; it right 16 bits, to scale back to 16 bits.
+
+ sub rsp,16*4+8
+
+	; save xmm6 and xmm7 on the stack to preserve them (callee-saved in the win64 ABI)
+ movdqa [rsp+16*2],xmm6
+ movdqa [rsp+16*3],xmm7
+
+ ; check if smaller than 128B
+ cmp arg3, 128
+
+ ; for sizes less than 128, we can't fold 64B at a time...
+ jl _less_than_128
+
+
+ ; load the initial crc value
+ movd xmm6, arg1_low32 ; initial crc
+
+ ; crc value does not need to be byte-reflected, but it needs to
+ ; be moved to the high part of the register.
+ ; because data will be byte-reflected and will align with
+ ; initial crc at correct place.
+ pslldq xmm6, 12
+
+ movdqa xmm7, [SHUF_MASK]
+ ; receive the initial 64B data, xor the initial crc value
+ movdqu xmm0, [arg2]
+ movdqu xmm1, [arg2+16]
+ movdqu xmm2, [arg2+32]
+ movdqu xmm3, [arg2+48]
+
+ pshufb xmm0, xmm7
+ ; XOR the initial_crc value
+ pxor xmm0, xmm6
+ pshufb xmm1, xmm7
+ pshufb xmm2, xmm7
+ pshufb xmm3, xmm7
+
+ movdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
+ ;imm value of pclmulqdq instruction
+ ;will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 128 instead of 64 to save one instruction from the loop
+ sub arg3, 128
+
+ ; at this section of the code, there is 64*x+y (0<=y<64) bytes of
+ ; buffer. The _fold_64_B_loop
+ ; loop will fold 64B at a time until we have 64+y Bytes of buffer
+
+
+ ; fold 64B at a time. This section of the code folds 4 xmm
+ ; registers in parallel
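+	; (note: in this by-4 variant xmm6 holds rk3:rk4, the 64-byte-stride fold
+	;  constants; each iteration multiplies the accumulators first and xors in
+	;  the freshly loaded, byte-reflected data afterwards, which gives the same
+	;  result as the load-first ordering used by the larger variants)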
+_fold_64_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 64 ; buf += 64;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm4, xmm0
+ movdqu xmm5, xmm1
+
+ pclmulqdq xmm0, xmm6 , 0x11
+ pclmulqdq xmm1, xmm6 , 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm4, xmm2
+ movdqu xmm5, xmm3
+
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ movdqu xmm4, [arg2]
+ movdqu xmm5, [arg2+16]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ movdqu xmm4, [arg2+32]
+ movdqu xmm5, [arg2+48]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ sub arg3, 64
+
+ ; check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 64
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
+
+
+ ; fold the 4 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will
+ ;determine which constant to use
+
+ movdqa xmm4, xmm0
+ pclmulqdq xmm0, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm1, xmm4
+ pxor xmm1, xmm0
+
+ movdqa xmm4, xmm1
+ pclmulqdq xmm1, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm2, xmm4
+ pxor xmm2, xmm1
+
+ movdqa xmm4, xmm2
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+
+ ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 64-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes
+ ; is in register xmm3 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm7
+ pxor xmm3, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm3 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer,
+ ; we can offset the input pointer before the actual point,
+ ; to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm3
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm7
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm3 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm6, [rk5] ; rk5 and rk6 in xmm6
+ movdqa xmm0, xmm3
+
+ ;64b fold
+ pclmulqdq xmm3, xmm6, 0x1
+ pslldq xmm0, 8
+ pxor xmm3, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm3
+
+ pand xmm0, [mask2]
+
+ psrldq xmm3, 12
+ pclmulqdq xmm3, xmm6, 0x10
+ pxor xmm3, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm6, [rk7] ; rk7 and rk8 in xmm6
+ movdqa xmm0, xmm3
+ pclmulqdq xmm3, xmm6, 0x01
+ pslldq xmm3, 4
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pslldq xmm3, 4
+ pxor xmm3, xmm0
+ pextrd eax, xmm3,1
+
+_cleanup:
+ ; scale the result back to 16 bits
+ shr eax, 16
+ movdqa xmm6, [rsp+16*2]
+ movdqa xmm7, [rsp+16*3]
+ add rsp,16*4+8
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_128:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm7, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+ movdqu xmm3, [arg2] ; load the plaintext
+ pshufb xmm3, xmm7 ; byte-reflect the plaintext
+ pxor xmm3, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm7, [SHUF_MASK]
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm3, [arg2] ; load the plaintext
+ pshufb xmm3, xmm7 ; byte-reflect the plaintext
+ pxor xmm3, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm3, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
+align 16
+; Q = 0x18BB70000
+; rk1 = 2^(32*3) mod Q << 32
+; rk2 = 2^(32*5) mod Q << 32
+; rk3 = 2^(32*15) mod Q << 32
+; rk4 = 2^(32*17) mod Q << 32
+; rk5 = 2^(32*3) mod Q << 32
+; rk6 = 2^(32*2) mod Q << 32
+; rk7 = floor(2^64/Q)
+; rk8 = Q
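+;
+; A hedged sketch (plain C, GF(2) polynomial arithmetic) of how constants of
+; this form can be derived from the exponents listed above; the values below
+; are precomputed and nothing is regenerated at build time:
+;
+;     /* x^n mod q over GF(2); q has its leading term at bit position 'deg' */
+;     static uint64_t xn_mod_q(int n, uint64_t q, int deg)
+;     {
+;             uint64_t r = 1;
+;             while (n--) {
+;                     r <<= 1;
+;                     if (r >> deg)
+;                             r ^= q;
+;             }
+;             return r;
+;     }
+;     /* e.g. a constant of the rk5 form: xn_mod_q(32*3, 0x18BB70000ULL, 32) << 32 */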
+rk1:
+DQ 0x2d56000000000000
+rk2:
+DQ 0x06df000000000000
+rk3:
+DQ 0x044c000000000000
+rk4:
+DQ 0xe658000000000000
+rk5:
+DQ 0x2d56000000000000
+rk6:
+DQ 0x1368000000000000
+rk7:
+DQ 0x00000001f65a57f8
+rk8:
+DQ 0x000000018bb70000
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+;;; func core, ver, snum
+slversion crc16_t10dif_by4, 05, 02, 0016
diff --git a/src/isa-l/crc/crc16_t10dif_copy_by4.asm b/src/isa-l/crc/crc16_t10dif_copy_by4.asm
new file mode 100644
index 000000000..b8a6838b4
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_copy_by4.asm
@@ -0,0 +1,599 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Function API:
+; UINT16 crc16_t10dif_copy_by4(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; unsigned char *dst, //buffer pointer destination for copy
+; const unsigned char *src, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
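+; A minimal calling sketch in C (illustrative; applications normally call the
+; isa-l multibinary entry crc16_t10dif_copy() from crc.h, which dispatches to
+; an implementation such as this one):
+;
+;     uint8_t src[512], dst[512];
+;     /* ... fill src ... */
+;     uint16_t crc = crc16_t10dif_copy(0, dst, src, sizeof(src));
+;     /* dst now holds a copy of src and crc holds its T10-DIF CRC */
+;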
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine tmp1 r10
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine tmp1 r10
+ %xdefine arg1_low32 edi
+%endif
+
+align 16
+mk_global crc16_t10dif_copy_by4, function
+crc16_t10dif_copy_by4:
+ endbranch
+
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+	; After this point the code flow is exactly the same as for a 32-bit CRC.
+	; The only difference is that before returning eax we shift it right by
+	; 16 bits, to scale the result back to 16 bits.
+
+ sub rsp,16*4+8
+
+	; save xmm6 and xmm7 on the stack; they are restored before returning
+ movdqa [rsp+16*2],xmm6
+ movdqa [rsp+16*3],xmm7
+
+ ; check if smaller than 128B
+ cmp arg4, 128
+
+ ; for sizes less than 128, we can't fold 64B at a time...
+ jl _less_than_128
+
+
+ ; load the initial crc value
+ movd xmm6, arg1_low32 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it does need to
+	; be moved to the high part of the register, because the data will be
+	; byte-reflected and will then line up with the initial crc in the
+	; correct place.
+ pslldq xmm6, 12
+
+ movdqa xmm7, [SHUF_MASK]
+ ; receive the initial 64B data, xor the initial crc value
+ movdqu xmm0, [arg3]
+ movdqu xmm1, [arg3+16]
+ movdqu xmm2, [arg3+32]
+ movdqu xmm3, [arg3+48]
+
+ ; copy initial data
+ movdqu [arg2], xmm0
+ movdqu [arg2+16], xmm1
+ movdqu [arg2+32], xmm2
+ movdqu [arg2+48], xmm3
+
+ pshufb xmm0, xmm7
+ ; XOR the initial_crc value
+ pxor xmm0, xmm6
+ pshufb xmm1, xmm7
+ pshufb xmm2, xmm7
+ pshufb xmm3, xmm7
+
+ movdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
+ ;imm value of pclmulqdq instruction
+ ;will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 128 instead of 64 to save one instruction from the loop
+ sub arg4, 128
+
+	; at this point there are 64*x+y (0<=y<64) bytes of buffer left;
+	; _fold_64_B_loop below folds 64B at a time until only 64+y bytes
+	; of buffer remain
+
+
+ ; fold 64B at a time. This section of the code folds 4 xmm
+ ; registers in parallel
+_fold_64_B_loop:
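+	; each iteration carries the four 128-bit accumulators forward across the
+	; 64-byte stride: the high and low qwords are multiplied carry-lessly by
+	; the constants in xmm6 (imm 0x11 selects the high halves, 0x00 the low
+	; halves), the two products are XORed together, and the result is XORed
+	; with the next 64 bytes of byte-reflected input, which are also copied
+	; to the destination on the way through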
+
+ ; update the buffer pointer
+ add arg3, 64 ; buf += 64;
+ add arg2, 64
+
+ prefetchnta [arg3+fetch_dist+0]
+ movdqu xmm4, xmm0
+ movdqu xmm5, xmm1
+
+ pclmulqdq xmm0, xmm6 , 0x11
+ pclmulqdq xmm1, xmm6 , 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ prefetchnta [arg3+fetch_dist+32]
+ movdqu xmm4, xmm2
+ movdqu xmm5, xmm3
+
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ movdqu xmm4, [arg3]
+ movdqu xmm5, [arg3+16]
+ movdqu [arg2], xmm4
+ movdqu [arg2+16], xmm5
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ movdqu xmm4, [arg3+32]
+ movdqu xmm5, [arg3+48]
+ movdqu [arg2+32], xmm4
+ movdqu [arg2+48], xmm5
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ sub arg4, 64
+
+ ; check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg3, 64
+ add arg2, 64
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
+
+
+ ; fold the 4 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will
+ ;determine which constant to use
+
+ movdqa xmm4, xmm0
+ pclmulqdq xmm0, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm1, xmm4
+ pxor xmm1, xmm0
+
+ movdqa xmm4, xmm1
+ pclmulqdq xmm1, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm2, xmm4
+ pxor xmm2, xmm1
+
+ movdqa xmm4, xmm2
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+
+ ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg4, 64-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes
+ ; is in register xmm3 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ movdqu xmm0, [arg3]
+ movdqu [arg2], xmm0
+ pshufb xmm0, xmm7
+ pxor xmm3, xmm0
+ add arg3, 16
+ add arg2, 16
+ sub arg4, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg4, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+	; now we have 16+z bytes left to reduce, where 0 <= z < 16.
+	; first, we reduce the data in the xmm3 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg4, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer,
+ ; we can offset the input pointer before the actual point,
+ ; to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm3
+
+ movdqu xmm1, [arg3 - 16 + arg4]
+ movdqu [arg2 - 16 + arg4], xmm1
+ pshufb xmm1, xmm7
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg4
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg4 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm3 to the right by 16-arg4 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+_128_done:
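+	; reduce the remaining 128 bits: fold the upper 64 bits down with rk5,
+	; then fold to a 32-bit value with rk6, leaving a quantity that the
+	; Barrett step below reduces modulo Q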
+ ; compute crc of a 128-bit value
+ movdqa xmm6, [rk5] ; rk5 and rk6 in xmm6
+ movdqa xmm0, xmm3
+
+ ;64b fold
+ pclmulqdq xmm3, xmm6, 0x1
+ pslldq xmm0, 8
+ pxor xmm3, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm3
+
+ pand xmm0, [mask2]
+
+ psrldq xmm3, 12
+ pclmulqdq xmm3, xmm6, 0x10
+ pxor xmm3, xmm0
+
+ ;barrett reduction
+_barrett:
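+	; Barrett reduction: estimate the quotient with a carry-less multiply by
+	; rk7 = floor(2^64/Q), multiply the estimate back by rk8 = Q, and XOR with
+	; the original value; the 32-bit remainder ends up in dword 1 of xmm3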
+ movdqa xmm6, [rk7] ; rk7 and rk8 in xmm6
+ movdqa xmm0, xmm3
+ pclmulqdq xmm3, xmm6, 0x01
+ pslldq xmm3, 4
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pslldq xmm3, 4
+ pxor xmm3, xmm0
+ pextrd eax, xmm3,1
+
+_cleanup:
+ ; scale the result back to 16 bits
+ shr eax, 16
+ movdqa xmm6, [rsp+16*2]
+ movdqa xmm7, [rsp+16*3]
+ add rsp,16*4+8
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_128:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg4, 32
+ jl _less_than_32
+ movdqa xmm7, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+ movdqu xmm3, [arg3] ; load the plaintext
+ movdqu [arg2], xmm3 ; store copy
+ pshufb xmm3, xmm7 ; byte-reflect the plaintext
+ pxor xmm3, xmm0
+
+
+ ; update the buffer pointer
+ add arg3, 16
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg4, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg4, arg4
+ je _cleanup
+
+ movdqa xmm7, [SHUF_MASK]
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg4, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm3, [arg3] ; load the plaintext
+ movdqu [arg2], xmm3 ; store the copy
+ pshufb xmm3, xmm7 ; byte-reflect the plaintext
+ pxor xmm3, xmm0 ; xor the initial crc value
+ add arg3, 16
+ add arg2, 16
+ sub arg4, 16
+ movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ cmp arg4, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov tmp1, arg4
+ cmp arg4, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg3]
+ mov [arg2], rax
+ mov [r11], rax
+ add r11, 8
+ sub arg4, 8
+ add arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg4, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg3]
+ mov [arg2], eax
+ mov [r11], eax
+ add r11, 4
+ sub arg4, 4
+ add arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg4, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg3]
+ mov [arg2], ax
+ mov [r11], ax
+ add r11, 2
+ sub arg4, 2
+ add arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg4, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg3]
+ mov [arg2], al
+ mov [r11], al
+_zero_left:
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ ; shl tmp1, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, tmp1
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm3, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm3, [arg3]
+ movdqu [arg2], xmm3
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg4, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg3]
+ mov [arg2], al
+ mov [r11], al
+
+ mov al, [arg3+1]
+ mov [arg2+1], al
+ mov [r11+1], al
+
+ mov al, [arg3+2]
+ mov [arg2+2], al
+ mov [r11+2], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg4, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg3]
+ mov [arg2], al
+ mov [r11], al
+
+ mov al, [arg3+1]
+ mov [arg2+1], al
+ mov [r11+1], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg3]
+ mov [arg2],al
+ mov [r11], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0 ; xor the initial crc value
+
+ psrldq xmm3, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
+align 16
+; Q = 0x18BB70000
+; rk1 = 2^(32*3) mod Q << 32
+; rk2 = 2^(32*5) mod Q << 32
+; rk3 = 2^(32*15) mod Q << 32
+; rk4 = 2^(32*17) mod Q << 32
+; rk5 = 2^(32*3) mod Q << 32
+; rk6 = 2^(32*2) mod Q << 32
+; rk7 = floor(2^64/Q)
+; rk8 = Q
+rk1:
+DQ 0x2d56000000000000
+rk2:
+DQ 0x06df000000000000
+rk3:
+DQ 0x044c000000000000
+rk4:
+DQ 0xe658000000000000
+rk5:
+DQ 0x2d56000000000000
+rk6:
+DQ 0x1368000000000000
+rk7:
+DQ 0x00000001f65a57f8
+rk8:
+DQ 0x000000018bb70000
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+;;; func core, ver, snum
+slversion crc16_t10dif_copy_by4, 05, 02, 0000
diff --git a/src/isa-l/crc/crc16_t10dif_copy_by4_02.asm b/src/isa-l/crc/crc16_t10dif_copy_by4_02.asm
new file mode 100644
index 000000000..254a18711
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_copy_by4_02.asm
@@ -0,0 +1,596 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Function API:
+; UINT16 crc16_t10dif_copy_by4_02(
+; UINT16 init_crc, //initial CRC value, 16 bits
+; unsigned char *dst, //buffer pointer destination for copy
+; const unsigned char *src, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
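+; This file is the VEX-encoded (AVX) variant of crc16_t10dif_copy_by4; the
+; control flow, folding scheme and constants are otherwise identical to the
+; SSE version in crc16_t10dif_copy_by4.asm.
+;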
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine tmp1 r10
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine tmp1 r10
+ %xdefine arg1_low32 edi
+%endif
+
+align 16
+mk_global crc16_t10dif_copy_by4_02, function
+crc16_t10dif_copy_by4_02:
+ endbranch
+
+ ; adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl arg1_low32, 16
+
+	; After this point the code flow is exactly the same as for a 32-bit CRC.
+	; The only difference is that before returning eax we shift it right by
+	; 16 bits, to scale the result back to 16 bits.
+
+ sub rsp,16*4+8
+
+	; save xmm6 and xmm7 on the stack; they are restored before returning
+ movdqa [rsp+16*2],xmm6
+ movdqa [rsp+16*3],xmm7
+
+ ; check if smaller than 128B
+ cmp arg4, 128
+
+ ; for sizes less than 128, we can't fold 64B at a time...
+ jl _less_than_128
+
+
+ ; load the initial crc value
+ vmovd xmm6, arg1_low32 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it does need to
+	; be moved to the high part of the register, because the data will be
+	; byte-reflected and will then line up with the initial crc in the
+	; correct place.
+ vpslldq xmm6, 12
+
+ vmovdqa xmm7, [SHUF_MASK]
+ ; receive the initial 64B data, xor the initial crc value
+ vmovdqu xmm0, [arg3]
+ vmovdqu xmm1, [arg3+16]
+ vmovdqu xmm2, [arg3+32]
+ vmovdqu xmm3, [arg3+48]
+
+ ; copy initial data
+ vmovdqu [arg2], xmm0
+ vmovdqu [arg2+16], xmm1
+ vmovdqu [arg2+32], xmm2
+ vmovdqu [arg2+48], xmm3
+
+ vpshufb xmm0, xmm7
+ ; XOR the initial_crc value
+ vpxor xmm0, xmm6
+ vpshufb xmm1, xmm7
+ vpshufb xmm2, xmm7
+ vpshufb xmm3, xmm7
+
+ vmovdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
+ ;imm value of pclmulqdq instruction
+ ;will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 128 instead of 64 to save one instruction from the loop
+ sub arg4, 128
+
+	; at this point there are 64*x+y (0<=y<64) bytes of buffer left;
+	; _fold_64_B_loop below folds 64B at a time until only 64+y bytes
+	; of buffer remain
+
+
+ ; fold 64B at a time. This section of the code folds 4 xmm
+ ; registers in parallel
+_fold_64_B_loop:
+
+ ; update the buffer pointer
+ add arg3, 64 ; buf += 64;
+ add arg2, 64
+
+ prefetchnta [arg3+fetch_dist+0]
+ vmovdqu xmm4, xmm0
+ vmovdqu xmm5, xmm1
+
+ vpclmulqdq xmm0, xmm6 , 0x11
+ vpclmulqdq xmm1, xmm6 , 0x11
+
+ vpclmulqdq xmm4, xmm6, 0x0
+ vpclmulqdq xmm5, xmm6, 0x0
+
+ vpxor xmm0, xmm4
+ vpxor xmm1, xmm5
+
+ prefetchnta [arg3+fetch_dist+32]
+ vmovdqu xmm4, xmm2
+ vmovdqu xmm5, xmm3
+
+ vpclmulqdq xmm2, xmm6, 0x11
+ vpclmulqdq xmm3, xmm6, 0x11
+
+ vpclmulqdq xmm4, xmm6, 0x0
+ vpclmulqdq xmm5, xmm6, 0x0
+
+ vpxor xmm2, xmm4
+ vpxor xmm3, xmm5
+
+ vmovdqu xmm4, [arg3]
+ vmovdqu xmm5, [arg3+16]
+ vmovdqu [arg2], xmm4
+ vmovdqu [arg2+16], xmm5
+ vpshufb xmm4, xmm7
+ vpshufb xmm5, xmm7
+ vpxor xmm0, xmm4
+ vpxor xmm1, xmm5
+
+ vmovdqu xmm4, [arg3+32]
+ vmovdqu xmm5, [arg3+48]
+ vmovdqu [arg2+32], xmm4
+ vmovdqu [arg2+48], xmm5
+ vpshufb xmm4, xmm7
+ vpshufb xmm5, xmm7
+
+ vpxor xmm2, xmm4
+ vpxor xmm3, xmm5
+
+ sub arg4, 64
+
+ ; check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg3, 64
+ add arg2, 64
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+ ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
+
+
+ ; fold the 4 xmm registers to 1 xmm register with different constants
+
+ vmovdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will
+ ;determine which constant to use
+
+ vmovdqa xmm4, xmm0
+ vpclmulqdq xmm0, xmm6, 0x11
+ vpclmulqdq xmm4, xmm6, 0x0
+ vpxor xmm1, xmm4
+ vpxor xmm1, xmm0
+
+ vmovdqa xmm4, xmm1
+ vpclmulqdq xmm1, xmm6, 0x11
+ vpclmulqdq xmm4, xmm6, 0x0
+ vpxor xmm2, xmm4
+ vpxor xmm2, xmm1
+
+ vmovdqa xmm4, xmm2
+ vpclmulqdq xmm2, xmm6, 0x11
+ vpclmulqdq xmm4, xmm6, 0x0
+ vpxor xmm3, xmm4
+ vpxor xmm3, xmm2
+
+
+ ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg4, 64-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes
+ ; is in register xmm3 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ vmovdqa xmm4, xmm3
+ vpclmulqdq xmm3, xmm6, 0x11
+ vpclmulqdq xmm4, xmm6, 0x0
+ vpxor xmm3, xmm4
+ vmovdqu xmm0, [arg3]
+ vmovdqu [arg2], xmm0
+ vpshufb xmm0, xmm7
+ vpxor xmm3, xmm0
+ add arg3, 16
+ add arg2, 16
+ sub arg4, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg4, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+	; now we have 16+z bytes left to reduce, where 0 <= z < 16.
+	; first, we reduce the data in the xmm3 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg4, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer,
+ ; we can offset the input pointer before the actual point,
+ ; to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ vmovdqa xmm2, xmm3
+
+ vmovdqu xmm1, [arg3 - 16 + arg4]
+ vmovdqu [arg2 - 16 + arg4], xmm1
+ vpshufb xmm1, xmm7
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg4
+ vmovdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg4 bytes
+ vpshufb xmm2, xmm0
+
+ ; shift xmm3 to the right by 16-arg4 bytes
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+ vpblendvb xmm1, xmm1, xmm2, xmm0
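+	; bytes whose xmm0 mask high bit is set come from xmm2, the rest from xmm1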
+
+ ; fold 16 Bytes
+ vmovdqa xmm2, xmm1
+ vmovdqa xmm4, xmm3
+ vpclmulqdq xmm3, xmm6, 0x11
+ vpclmulqdq xmm4, xmm6, 0x0
+ vpxor xmm3, xmm4
+ vpxor xmm3, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm6, [rk5] ; rk5 and rk6 in xmm6
+ vmovdqa xmm0, xmm3
+
+ ;64b fold
+ vpclmulqdq xmm3, xmm6, 0x1
+ vpslldq xmm0, 8
+ vpxor xmm3, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm3
+
+ vpand xmm0, [mask2]
+
+ vpsrldq xmm3, 12
+ vpclmulqdq xmm3, xmm6, 0x10
+ vpxor xmm3, xmm0
+
+ ;barrett reduction
+_barrett:
+ vmovdqa xmm6, [rk7] ; rk7 and rk8 in xmm6
+ vmovdqa xmm0, xmm3
+ vpclmulqdq xmm3, xmm6, 0x01
+ vpslldq xmm3, 4
+ vpclmulqdq xmm3, xmm6, 0x11
+
+ vpslldq xmm3, 4
+ vpxor xmm3, xmm0
+ vpextrd eax, xmm3,1
+
+_cleanup:
+ ; scale the result back to 16 bits
+ shr eax, 16
+ vmovdqa xmm6, [rsp+16*2]
+ vmovdqa xmm7, [rsp+16*3]
+ add rsp,16*4+8
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_128:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg4, 32
+ jl _less_than_32
+ vmovdqa xmm7, [SHUF_MASK]
+
+ ; if there is, load the constants
+ vmovdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+ vmovdqu xmm3, [arg3] ; load the plaintext
+ vmovdqu [arg2], xmm3 ; store copy
+ vpshufb xmm3, xmm7 ; byte-reflect the plaintext
+ vpxor xmm3, xmm0
+
+
+ ; update the buffer pointer
+ add arg3, 16
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg4, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg4, arg4
+ je _cleanup
+
+ vmovdqa xmm7, [SHUF_MASK]
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg4, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ vmovdqu xmm3, [arg3] ; load the plaintext
+ vmovdqu [arg2], xmm3 ; store the copy
+ vpshufb xmm3, xmm7 ; byte-reflect the plaintext
+ vpxor xmm3, xmm0 ; xor the initial crc value
+ add arg3, 16
+ add arg2, 16
+ sub arg4, 16
+ vmovdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg4, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov tmp1, arg4
+ cmp arg4, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg3]
+ mov [arg2], rax
+ mov [r11], rax
+ add r11, 8
+ sub arg4, 8
+ add arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg4, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg3]
+ mov [arg2], eax
+ mov [r11], eax
+ add r11, 4
+ sub arg4, 4
+ add arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg4, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg3]
+ mov [arg2], ax
+ mov [r11], ax
+ add r11, 2
+ sub arg4, 2
+ add arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg4, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg3]
+ mov [arg2], al
+ mov [r11], al
+_zero_left:
+ vmovdqa xmm3, [rsp]
+ vpshufb xmm3, xmm7
+ vpxor xmm3, xmm0 ; xor the initial crc value
+
+ ; shl tmp1, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, tmp1
+ vmovdqu xmm0, [rax]
+ vpxor xmm0, [mask1]
+
+ vpshufb xmm3, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ vmovdqu xmm3, [arg3]
+ vmovdqu [arg2], xmm3
+ vpshufb xmm3, xmm7
+ vpxor xmm3, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg4, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg3]
+ mov [arg2], al
+ mov [r11], al
+
+ mov al, [arg3+1]
+ mov [arg2+1], al
+ mov [r11+1], al
+
+ mov al, [arg3+2]
+ mov [arg2+2], al
+ mov [r11+2], al
+
+ vmovdqa xmm3, [rsp]
+ vpshufb xmm3, xmm7
+ vpxor xmm3, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm3, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg4, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg3]
+ mov [arg2], al
+ mov [r11], al
+
+ mov al, [arg3+1]
+ mov [arg2+1], al
+ mov [r11+1], al
+
+ vmovdqa xmm3, [rsp]
+ vpshufb xmm3, xmm7
+ vpxor xmm3, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm3, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg3]
+ mov [arg2],al
+ mov [r11], al
+
+ vmovdqa xmm3, [rsp]
+ vpshufb xmm3, xmm7
+ vpxor xmm3, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm3, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
+align 16
+; Q = 0x18BB70000
+; rk1 = 2^(32*3) mod Q << 32
+; rk2 = 2^(32*5) mod Q << 32
+; rk3 = 2^(32*15) mod Q << 32
+; rk4 = 2^(32*17) mod Q << 32
+; rk5 = 2^(32*3) mod Q << 32
+; rk6 = 2^(32*2) mod Q << 32
+; rk7 = floor(2^64/Q)
+; rk8 = Q
+rk1:
+DQ 0x2d56000000000000
+rk2:
+DQ 0x06df000000000000
+rk3:
+DQ 0x044c000000000000
+rk4:
+DQ 0xe658000000000000
+rk5:
+DQ 0x2d56000000000000
+rk6:
+DQ 0x1368000000000000
+rk7:
+DQ 0x00000001f65a57f8
+rk8:
+DQ 0x000000018bb70000
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
diff --git a/src/isa-l/crc/crc16_t10dif_copy_perf.c b/src/isa-l/crc/crc16_t10dif_copy_perf.c
new file mode 100644
index 000000000..17cba6bc0
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_copy_perf.c
@@ -0,0 +1,84 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+int main(int argc, char *argv[])
+{
+ void *src, *dst;
+ uint16_t crc;
+ struct perf start;
+
+ printf("crc16_t10dif_copy_perf:\n");
+
+ if (posix_memalign(&src, 1024, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ if (posix_memalign(&dst, 1024, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ memset(src, 0, TEST_LEN);
+ BENCHMARK(&start, BENCHMARK_TIME, crc =
+ crc16_t10dif_copy(TEST_SEED, dst, src, TEST_LEN));
+ printf("crc16_t10dif_copy" TEST_TYPE_STR ": ");
+ perf_print(start, (long long)TEST_LEN);
+
+ printf("finish 0x%x\n", crc);
+ return 0;
+}
diff --git a/src/isa-l/crc/crc16_t10dif_copy_test.c b/src/isa-l/crc/crc16_t10dif_copy_test.c
new file mode 100644
index 000000000..4c398c429
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_copy_test.c
@@ -0,0 +1,175 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "crc.h"
+#include "crc_ref.h"
+
+#ifndef RANDOMS
+# define RANDOMS 20
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define MAX_BUF 2345
+#define TEST_SIZE 217
+#define TEST_LEN (8 * 1024)
+
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+// bitwise crc version
+uint16_t crc16_t10dif_copy_ref(uint16_t seed, uint8_t * dst, uint8_t * src, uint64_t len);
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int memtst(unsigned char *buf, unsigned char c, int len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ if (*buf++ != c)
+ return 1;
+
+ return 0;
+}
+
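+/*
+ * Check one copy+CRC case: the combined routine must return the same CRC as
+ * the CRC-only function, dst[0..len-1] must match src, and the dst_fill_val
+ * padding beyond len must be untouched. Returns 0 on success, 1 on failure.
+ */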
+int crc_copy_check(const char *description, u8 * dst, u8 * src, u8 dst_fill_val, int len,
+ int tot)
+{
+ u16 seed;
+ int rem;
+
+ assert(tot >= len);
+ seed = rand();
+ rem = tot - len;
+ memset(dst, dst_fill_val, tot);
+
+ // multi-binary crc version
+ u16 crc_dut = crc16_t10dif_copy(seed, dst, src, len);
+ u16 crc_ref = crc16_t10dif(seed, src, len);
+ if (crc_dut != crc_ref) {
+ printf("%s, crc gen fail: 0x%4x 0x%4x len=%d\n", description, crc_dut,
+ crc_ref, len);
+ return 1;
+ } else if (memcmp(dst, src, len)) {
+ printf("%s, copy fail: len=%d\n", description, len);
+ return 1;
+ } else if (memtst(&dst[len], dst_fill_val, rem)) {
+ printf("%s, writeover fail: len=%d\n", description, len);
+ return 1;
+ }
+ // bitwise crc version
+ crc_dut = crc16_t10dif_copy_ref(seed, dst, src, len);
+ crc_ref = crc16_t10dif_ref(seed, src, len);
+ if (crc_dut != crc_ref) {
+ printf("%s, crc gen fail (table-driven): 0x%4x 0x%4x len=%d\n", description,
+ crc_dut, crc_ref, len);
+ return 1;
+ } else if (memcmp(dst, src, len)) {
+ printf("%s, copy fail (table driven): len=%d\n", description, len);
+ return 1;
+ } else if (memtst(&dst[len], dst_fill_val, rem)) {
+ printf("%s, writeover fail (table driven): len=%d\n", description, len);
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int r = 0;
+ int i;
+ int len, tot;
+ u8 *src_raw, *dst_raw;
+ u8 *src, *dst;
+
+ printf("Test crc16_t10dif_copy_test:\n");
+ src_raw = (u8 *) malloc(TEST_LEN);
+ dst_raw = (u8 *) malloc(TEST_LEN);
+ if (NULL == src_raw || NULL == dst_raw) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ src = src_raw;
+ dst = dst_raw;
+
+ srand(TEST_SEED);
+
+ // Test of all zeros
+ memset(src, 0, TEST_LEN);
+ r |= crc_copy_check("zero tst", dst, src, 0x5e, MAX_BUF, TEST_LEN);
+
+ // Another simple test pattern
+ memset(src, 0xff, TEST_LEN);
+ r |= crc_copy_check("simp tst", dst, src, 0x5e, MAX_BUF, TEST_LEN);
+
+ // Do a few short len random data tests
+ rand_buffer(src, TEST_LEN);
+ rand_buffer(dst, TEST_LEN);
+ for (i = 0; i < MAX_BUF; i++) {
+ r |= crc_copy_check("short len", dst, src, rand(), i, MAX_BUF);
+ }
+ printf(".");
+
+ // Do a few longer tests, random data
+ for (i = TEST_LEN; i >= (TEST_LEN - TEST_SIZE); i--) {
+ r |= crc_copy_check("long len", dst, src, rand(), i, TEST_LEN);
+ }
+ printf(".");
+
+ // Do random size, random data
+ for (i = 0; i < RANDOMS; i++) {
+ len = rand() % TEST_LEN;
+ r |= crc_copy_check("rand len", dst, src, rand(), len, TEST_LEN);
+ }
+ printf(".");
+
+ // Run tests at end of buffer
+ for (i = 0; i < RANDOMS; i++) {
+ len = rand() % TEST_LEN;
+ src = &src_raw[TEST_LEN - len - 1];
+ dst = &dst_raw[TEST_LEN - len - 1];
+ tot = len;
+ r |= crc_copy_check("end of buffer", dst, src, rand(), len, tot);
+ }
+ printf(".");
+
+ printf("Test done: %s\n", r ? "Fail" : "Pass");
+ return r;
+}
diff --git a/src/isa-l/crc/crc16_t10dif_op_perf.c b/src/isa-l/crc/crc16_t10dif_op_perf.c
new file mode 100644
index 000000000..9b91ef39d
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_op_perf.c
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "crc.h"
+#include "test.h"
+
+#define BLKSIZE (512)
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define NBLOCKS 100
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define NBLOCKS (TEST_LEN / BLKSIZE)
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+struct blk {
+ uint8_t data[BLKSIZE];
+};
+
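+/* A block extended with an 8-byte footer (tag, meta, crc); the copy routine
+   writes the data here and the computed CRC is inserted into the footer. */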
+struct blk_ext {
+ uint8_t data[BLKSIZE];
+ uint32_t tag;
+ uint16_t meta;
+ uint16_t crc;
+};
+
+void crc16_t10dif_copy_perf(struct blk *blks, struct blk *blkp, struct blk_ext *blks_ext,
+ struct blk_ext *blkp_ext, uint16_t * crc)
+{
+ int i;
+ for (i = 0, blkp = blks, blkp_ext = blks_ext; i < NBLOCKS; i++) {
+ *crc = crc16_t10dif_copy(TEST_SEED, blkp_ext->data, blkp->data,
+ sizeof(blks->data));
+ blkp_ext->crc = *crc;
+ blkp++;
+ blkp_ext++;
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ uint16_t crc;
+ struct blk *blks, *blkp;
+ struct blk_ext *blks_ext, *blkp_ext;
+ struct perf start;
+
+ printf("crc16_t10dif_streaming_insert_perf:\n");
+
+ if (posix_memalign((void *)&blks, 1024, NBLOCKS * sizeof(*blks))) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ if (posix_memalign((void *)&blks_ext, 1024, NBLOCKS * sizeof(*blks_ext))) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+
+ printf(" size blk: %ld, blk_ext: %ld, blk data: %ld, stream: %ld\n",
+ sizeof(*blks), sizeof(*blks_ext), sizeof(blks->data),
+ NBLOCKS * sizeof(blks->data));
+ memset(blks, 0xe5, NBLOCKS * sizeof(*blks));
+ memset(blks_ext, 0xe5, NBLOCKS * sizeof(*blks_ext));
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ // Copy and insert test
+ BENCHMARK(&start, BENCHMARK_TIME,
+ crc16_t10dif_copy_perf(blks, blkp, blks_ext, blkp_ext, &crc));
+
+ printf("crc16_t10pi_op_copy_insert" TEST_TYPE_STR ": ");
+ perf_print(start, (long long)sizeof(blks->data) * NBLOCKS);
+
+ printf("finish 0x%x\n", crc);
+ return 0;
+}
diff --git a/src/isa-l/crc/crc16_t10dif_perf.c b/src/isa-l/crc/crc16_t10dif_perf.c
new file mode 100644
index 000000000..7b7c0bcd9
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_perf.c
@@ -0,0 +1,79 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+int main(int argc, char *argv[])
+{
+ void *buf;
+ uint16_t crc;
+ struct perf start;
+
+ printf("crc16_t10dif_perf:\n");
+
+ if (posix_memalign(&buf, 1024, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ memset(buf, 0, TEST_LEN);
+ BENCHMARK(&start, BENCHMARK_TIME, crc = crc16_t10dif(TEST_SEED, buf, TEST_LEN));
+ printf("crc16_t10dif" TEST_TYPE_STR ": ");
+ perf_print(start, (long long)TEST_LEN);
+
+ printf("finish 0x%x\n", crc);
+ return 0;
+}
diff --git a/src/isa-l/crc/crc16_t10dif_test.c b/src/isa-l/crc/crc16_t10dif_test.c
new file mode 100644
index 000000000..ceb9aab45
--- /dev/null
+++ b/src/isa-l/crc/crc16_t10dif_test.c
@@ -0,0 +1,179 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "crc.h"
+#include "types.h"
+#include "crc_ref.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define MAX_BUF 4096
+#define TEST_SIZE 20
+
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+uint16_t crc16_t10dif_ref(uint16_t seed, uint8_t * buf, uint64_t len);
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ u32 r = 0;
+ int verbose = argc - 1;
+ int i, s;
+ void *buf_raw;
+ unsigned char *buf;
+
+ printf("Test crc16_t10dif_test ");
+ if (posix_memalign(&buf_raw, 32, MAX_BUF * TEST_SIZE)) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ buf = (unsigned char *)buf_raw;
+
+ srand(TEST_SEED);
+
+ // Test of all zeros
+ memset(buf, 0, MAX_BUF * 10);
+ u16 crc_ref = crc16_t10dif_ref(TEST_SEED, buf, MAX_BUF);
+ u16 crc_base = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
+ u16 crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
+ if ((crc_base != crc_ref) || (crc != crc_ref)) {
+ fail++;
+ printf("\n opt ref\n");
+ printf(" ------ ------\n");
+ printf("crc zero = 0x%4x 0x%4x 0x%4x \n", crc_ref, crc_base, crc);
+ } else
+ printf(".");
+
+ // Another simple test pattern
+ memset(buf, 0x8a, MAX_BUF);
+ crc_ref = crc16_t10dif_ref(TEST_SEED, buf, MAX_BUF);
+ crc_base = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
+ crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
+ if ((crc_base != crc_ref) || (crc != crc_ref)) {
+ fail++;
+ printf("crc all 8a = 0x%4x 0x%4x 0x%4x\n", crc_ref, crc_base, crc);
+ } else
+ printf(".");
+
+ // Do a few random tests
+
+ rand_buffer(buf, MAX_BUF * TEST_SIZE);
+
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc_ref = crc16_t10dif_ref(TEST_SEED, buf, MAX_BUF);
+ crc_base = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
+ crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%4x 0x%4x 0x%4x\n", i, crc_ref, crc_base, crc);
+ else if (i % (TEST_SIZE / 8) == 0)
+ printf(".");
+ buf += MAX_BUF;
+ }
+
+ // Do a few random sizes
+ buf = (unsigned char *)buf_raw; //reset buf
+ r = rand();
+
+ for (i = MAX_BUF; i >= 0; i--) {
+ crc_ref = crc16_t10dif_ref(r, buf, i);
+ crc_base = crc16_t10dif_base(r, buf, i);
+ crc = crc16_t10dif(r, buf, i);
+ if ((crc_base != crc_ref) || (crc != crc_ref)) {
+ fail++;
+ printf("fail random size%i 0x%8x 0x%8x 0x%8x\n", i, crc_ref, crc_base,
+ crc);
+ } else if (i % (MAX_BUF / 8) == 0)
+ printf(".");
+ }
+
+ // Try different seeds
+ for (s = 0; s < 20; s++) {
+ buf = (unsigned char *)buf_raw; //reset buf
+
+ r = rand(); // just to get a new seed
+ rand_buffer(buf, MAX_BUF * TEST_SIZE); // new pseudo-rand data
+
+ if (verbose)
+ printf("seed = 0x%x\n", r);
+
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc_ref = crc16_t10dif_ref(r, buf, MAX_BUF);
+ crc_base = crc16_t10dif_base(r, buf, MAX_BUF);
+ crc = crc16_t10dif(r, buf, MAX_BUF);
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%4x 0x%4x 0x%4x\n", i, crc_ref,
+ crc_base, crc);
+ else if (i % (TEST_SIZE * 20 / 8) == 0)
+ printf(".");
+ buf += MAX_BUF;
+ }
+ }
+
+ // Run tests at end of buffer
+ buf = (unsigned char *)buf_raw; //reset buf
+ buf = buf + ((MAX_BUF - 1) * TEST_SIZE); //Line up TEST_SIZE from end
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc_ref = crc16_t10dif_ref(TEST_SEED, buf + i, TEST_SIZE - i);
+ crc_base = crc16_t10dif_base(TEST_SEED, buf + i, TEST_SIZE - i);
+ crc = crc16_t10dif(TEST_SEED, buf + i, TEST_SIZE - i);
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc eob rand%3d = 0x%4x 0x%4x 0x%4x\n", i, crc_ref, crc_base,
+ crc);
+ else
+ printf(".");
+ }
+
+ printf("Test done: %s\n", fail ? "Fail" : "Pass");
+ if (fail)
+ printf("\nFailed %d tests\n", fail);
+
+ return fail;
+}
diff --git a/src/isa-l/crc/crc32_funcs_test.c b/src/isa-l/crc/crc32_funcs_test.c
new file mode 100644
index 000000000..e28da4018
--- /dev/null
+++ b/src/isa-l/crc/crc32_funcs_test.c
@@ -0,0 +1,324 @@
+/**********************************************************************
+ Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "crc.h"
+#include "types.h"
+#include "crc_ref.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define MAX_BUF 4096
+#define TEST_SIZE 32
+
+typedef uint32_t(*crc32_func_t) (uint32_t, const uint8_t *, uint64_t);
+typedef uint32_t(*crc32_func_t_base) (uint32_t, uint8_t *, uint64_t);
+typedef uint32_t(*crc32_func_t_ref) (uint32_t, uint8_t *, uint64_t);
+
+typedef struct func_case {
+ char *note;
+ crc32_func_t crc32_func_call;
+ crc32_func_t_base crc32_base_call;
+ crc32_func_t_ref crc32_ref_call;
+} func_case_t;
+
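+/* crc32_iscsi() and its base/ref variants take (buf, len, seed), so wrap them
+   to match the common (seed, buf, len) signature used in the table below. */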
+uint32_t crc32_iscsi_wrap(uint32_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc32_iscsi((uint8_t *) buf, len, seed);
+}
+
+uint32_t crc32_iscsi_base_wrap(uint32_t seed, uint8_t * buf, uint64_t len)
+{
+ return crc32_iscsi_base(buf, len, seed);
+}
+
+uint32_t crc32_iscsi_ref_wrap(uint32_t seed, uint8_t * buf, uint64_t len)
+{
+ return crc32_iscsi_ref(buf, len, seed);
+}
+
+func_case_t test_funcs[] = {
+ {"crc32_ieee", crc32_ieee, crc32_ieee_base, crc32_ieee_ref}
+ ,
+ {"crc32_gzip_refl", crc32_gzip_refl, crc32_gzip_refl_base, crc32_gzip_refl_ref}
+ ,
+ {"crc32_iscsi", crc32_iscsi_wrap, crc32_iscsi_base_wrap, crc32_iscsi_ref_wrap}
+};
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+// Test cases
+int zeros_test(func_case_t * test_func);
+
+int simple_pattern_test(func_case_t * test_func);
+
+int seeds_sizes_test(func_case_t * test_func);
+
+int eob_test(func_case_t * test_func);
+
+int update_test(func_case_t * test_func);
+
+int verbose = 0;
+void *buf_alloc = NULL;
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, fail_case;
+ int i, ret;
+ func_case_t *test_func;
+
+ verbose = argc - 1;
+
+ // Align to TEST_SIZE boundary
+ ret = posix_memalign(&buf_alloc, TEST_SIZE, MAX_BUF * TEST_SIZE);
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ srand(TEST_SEED);
+ printf("CRC32 Tests\n");
+
+ for (i = 0; i < sizeof(test_funcs) / sizeof(test_funcs[0]); i++) {
+ fail_case = 0;
+ test_func = &test_funcs[i];
+
+ printf("Test %s\t", test_func->note);
+ fail_case += zeros_test(test_func);
+ fail_case += simple_pattern_test(test_func);
+ fail_case += seeds_sizes_test(test_func);
+ fail_case += eob_test(test_func);
+ fail_case += update_test(test_func);
+ printf(" done: %s\n", fail_case ? "Fail" : "Pass");
+
+ if (fail_case) {
+ printf("\n%s Failed %d tests\n", test_func->note, fail_case);
+ fail++;
+ }
+ }
+
+ printf("CRC32 Tests all done: %s\n", fail ? "Fail" : "Pass");
+
+ return fail;
+}
+
+// Test of all zeros
+int zeros_test(func_case_t * test_func)
+{
+ uint32_t crc_ref, crc_base, crc;
+ int fail = 0;
+ unsigned char *buf = NULL;
+
+ buf = (unsigned char *)buf_alloc;
+ memset(buf, 0, MAX_BUF * 10);
+ crc_ref = test_func->crc32_ref_call(TEST_SEED, buf, MAX_BUF * 10);
+ crc_base = test_func->crc32_base_call(TEST_SEED, buf, MAX_BUF * 10);
+ crc = test_func->crc32_func_call(TEST_SEED, buf, MAX_BUF * 10);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref)) {
+ fail++;
+ printf("\n opt ref\n");
+ printf(" ------ ------\n");
+ printf("crc zero = 0x%8x 0x%8x 0x%8x\n", crc_ref, crc_base, crc);
+ } else
+ printf(".");
+
+ return fail;
+}
+
+// Another simple test pattern
+int simple_pattern_test(func_case_t * test_func)
+{
+ uint32_t crc_ref, crc_base, crc;
+ int fail = 0;
+ unsigned char *buf = NULL;
+
+ buf = (unsigned char *)buf_alloc;
+ memset(buf, 0x8a, MAX_BUF);
+ crc_ref = test_func->crc32_ref_call(TEST_SEED, buf, MAX_BUF);
+ crc_base = test_func->crc32_base_call(TEST_SEED, buf, MAX_BUF);
+ crc = test_func->crc32_func_call(TEST_SEED, buf, MAX_BUF);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc all 8a = 0x%8x 0x%8x 0x%8x\n", crc_ref, crc_base, crc);
+ else
+ printf(".");
+
+ return fail;
+}
+
+int seeds_sizes_test(func_case_t * test_func)
+{
+ uint32_t crc_ref, crc_base, crc;
+ int fail = 0;
+ int i;
+ uint64_t r, s;
+ unsigned char *buf = NULL;
+
+ // Do a few random tests
+ buf = (unsigned char *)buf_alloc; //reset buf
+ r = rand();
+ rand_buffer(buf, MAX_BUF * TEST_SIZE);
+
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc_ref = test_func->crc32_ref_call(r, buf, MAX_BUF);
+ crc_base = test_func->crc32_base_call(r, buf, MAX_BUF);
+ crc = test_func->crc32_func_call(r, buf, MAX_BUF);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%8x 0x%8x 0x%8x\n", i, crc_ref, crc_base, crc);
+ else if (i % (TEST_SIZE / 8) == 0)
+ printf(".");
+ buf += MAX_BUF;
+ }
+
+ // Do a few random sizes
+ buf = (unsigned char *)buf_alloc; //reset buf
+ r = rand();
+
+ for (i = MAX_BUF; i >= 0; i--) {
+ crc_ref = test_func->crc32_ref_call(r, buf, i);
+ crc_base = test_func->crc32_base_call(r, buf, i);
+ crc = test_func->crc32_func_call(r, buf, i);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref)) {
+ fail++;
+ printf("fail random size%i 0x%8x 0x%8x 0x%8x\n", i, crc_ref, crc_base,
+ crc);
+ } else if (i % (MAX_BUF / 8) == 0)
+ printf(".");
+ }
+
+ // Try different seeds
+ for (s = 0; s < 20; s++) {
+ buf = (unsigned char *)buf_alloc; //reset buf
+
+ r = rand(); // just to get a new seed
+ rand_buffer(buf, MAX_BUF * TEST_SIZE); // new pseudo-rand data
+
+ if (verbose)
+ printf("seed = 0x%lx\n", r);
+
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc_ref = test_func->crc32_ref_call(r, buf, MAX_BUF);
+ crc_base = test_func->crc32_base_call(r, buf, MAX_BUF);
+ crc = test_func->crc32_func_call(r, buf, MAX_BUF);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%8x 0x%8x 0x%8x\n", i, crc_ref,
+ crc_base, crc);
+ else if (i % (TEST_SIZE * 20 / 8) == 0)
+ printf(".");
+ buf += MAX_BUF;
+ }
+ }
+
+ return fail;
+}
+
+// Run tests at end of buffer
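+// The window below is anchored so that it always ends at the last byte of the
+// aligned allocation, which exercises the short-length and tail-handling paths
+// of the optimized implementations right at the end of the buffer.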
+int eob_test(func_case_t * test_func)
+{
+ uint32_t crc_ref, crc_base, crc;
+ int fail = 0;
+ int i;
+ unsigned char *buf = NULL;
+
+ // Null test
+ if (0 != test_func->crc32_func_call(0, NULL, 0)) {
+ fail++;
+ printf("crc null test fail\n");
+ }
+
+ buf = (unsigned char *)buf_alloc; //reset buf
+	buf = buf + ((MAX_BUF - 1) * TEST_SIZE);	// leave exactly TEST_SIZE bytes to the end of the buffer
+ for (i = 0; i <= TEST_SIZE; i++) {
+ crc_ref = test_func->crc32_ref_call(TEST_SEED, buf + i, TEST_SIZE - i);
+ crc_base = test_func->crc32_base_call(TEST_SEED, buf + i, TEST_SIZE - i);
+ crc = test_func->crc32_func_call(TEST_SEED, buf + i, TEST_SIZE - i);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc eob rand%3d = 0x%8x 0x%8x 0x%8x\n", i, crc_ref, crc_base,
+ crc);
+ else if (i % (TEST_SIZE / 8) == 0)
+ printf(".");
+ }
+
+ return fail;
+}
+
+int update_test(func_case_t * test_func)
+{
+ uint32_t crc_ref, crc_base, crc;
+ int fail = 0;
+ int i;
+ uint64_t r;
+ unsigned char *buf = NULL;
+
+ buf = (unsigned char *)buf_alloc; //reset buf
+ r = rand();
+ // Process the whole buf with reference func single call.
+ crc_ref = test_func->crc32_ref_call(r, buf, MAX_BUF * TEST_SIZE);
+ crc_base = test_func->crc32_base_call(r, buf, MAX_BUF * TEST_SIZE);
+ // Process buf with update method.
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc = test_func->crc32_func_call(r, buf, MAX_BUF);
+ // Update crc seeds and buf pointer.
+ r = crc;
+ buf += MAX_BUF;
+ }
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%8x 0x%8x 0x%8x\n", i, crc_ref, crc_base, crc);
+ else
+ printf(".");
+
+ return fail;
+}
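update_test() above relies on the seed-chaining property of these CRCs: processing a buffer in pieces, feeding each intermediate result back in as the seed for the next piece, must give the same value as one call over the whole buffer. A minimal, self-contained sketch of that property, using a plain bit-at-a-time reflected CRC-32 over the RFC 1952 polynomial rather than the isa-l implementation:

    #include <stdint.h>
    #include <stddef.h>

    /* Bit-at-a-time reflected CRC-32 (polynomial 0xEDB88320) with the same seed
     * convention as crc32_gzip_refl: invert the seed on entry and the result on
     * return, so that chained calls compose. */
    static uint32_t crc32_gzip_sketch(uint32_t seed, const uint8_t *buf, size_t len)
    {
            uint32_t crc = seed ^ 0xFFFFFFFFu;
            size_t i;
            int b;

            for (i = 0; i < len; i++) {
                    crc ^= buf[i];
                    for (b = 0; b < 8; b++)
                            crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
            }
            return crc ^ 0xFFFFFFFFu;
    }

    /* CRC of the whole buffer computed chunk by chunk, the way update_test()
     * drives the optimized function (chunk must be non-zero). */
    static uint32_t crc32_chunked_sketch(uint32_t seed, const uint8_t *buf,
                                         size_t len, size_t chunk)
    {
            uint32_t crc = seed;

            while (len) {
                    size_t n = len < chunk ? len : chunk;
                    crc = crc32_gzip_sketch(crc, buf, n);
                    buf += n;
                    len -= n;
            }
            return crc;
    }

For any chunk size, crc32_chunked_sketch(seed, buf, len, chunk) equals crc32_gzip_sketch(seed, buf, len), which is the same equality update_test() checks between the chunked optimized calls and the single-call reference.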
diff --git a/src/isa-l/crc/crc32_gzip_refl_by16_10.asm b/src/isa-l/crc/crc32_gzip_refl_by16_10.asm
new file mode 100644
index 000000000..15280b8cf
--- /dev/null
+++ b/src/isa-l/crc/crc32_gzip_refl_by16_10.asm
@@ -0,0 +1,569 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; UINT32 crc32_gzip_refl_by16_10(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
+;
+; sample yasm command line:
+; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by16_10
+;
+; As explained here:
+; http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
+; CRC-32 checksum is described in RFC 1952
+; Implementing RFC 1952 CRC:
+; http://www.ietf.org/rfc/rfc1952.txt
+
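+; Note on the constants: RFC 1952 CRC-32 uses the generator polynomial
+; 0x04C11DB7, processed bit-reflected (reflected form 0xEDB88320).  The rk*
+; values in the data section below are precomputed folding multipliers
+; (powers of x reduced modulo that polynomial), derived as described in the
+; reference paper above; they are what let the loops below fold 256, 128 and
+; 16 bytes per iteration with vpclmulqdq.
+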
+%include "reg_sizes.asm"
+
+%ifndef FUNCTION_NAME
+%define FUNCTION_NAME crc32_gzip_refl_by16_10
+%endif
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*12+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global FUNCTION_NAME, function
+FUNCTION_NAME:
+ endbranch
+
+ not arg1_low32
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+ jl .less_than_256
+
+ ; load the initial crc value
+ vmovd xmm10, arg1_low32 ; initial crc
+
+ ; receive the initial 64B data, xor the initial crc value
+ vmovdqu8 zmm0, [arg2+16*0]
+ vmovdqu8 zmm4, [arg2+16*4]
+ vpxorq zmm0, zmm10
+	vbroadcasti32x4 zmm10, [rk3]	;zmm10 has rk3 and rk4 in each 128-bit lane
+ ;imm value of pclmulqdq instruction will determine which constant to use
+
+ sub arg3, 256
+ cmp arg3, 256
+ jl .fold_128_B_loop
+
+ vmovdqu8 zmm7, [arg2+16*8]
+ vmovdqu8 zmm8, [arg2+16*12]
+ vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
+ sub arg3, 256
+
+.fold_256_B_loop:
+ add arg2, 256
+ vmovdqu8 zmm3, [arg2+16*0]
+ vpclmulqdq zmm1, zmm0, zmm16, 0x10
+ vpclmulqdq zmm2, zmm0, zmm16, 0x01
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm3
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpclmulqdq zmm5, zmm4, zmm16, 0x10
+ vpclmulqdq zmm6, zmm4, zmm16, 0x01
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ vmovdqu8 zmm11, [arg2+16*8]
+ vpclmulqdq zmm12, zmm7, zmm16, 0x10
+ vpclmulqdq zmm13, zmm7, zmm16, 0x01
+ vpxorq zmm7, zmm12, zmm13
+ vpxorq zmm7, zmm7, zmm11
+
+ vmovdqu8 zmm17, [arg2+16*12]
+ vpclmulqdq zmm14, zmm8, zmm16, 0x10
+ vpclmulqdq zmm15, zmm8, zmm16, 0x01
+ vpxorq zmm8, zmm14, zmm15
+ vpxorq zmm8, zmm8, zmm17
+
+ sub arg3, 256
+ jge .fold_256_B_loop
+
+ ;; Fold 256 into 128
+ add arg2, 256
+ vpclmulqdq zmm1, zmm0, zmm10, 0x01
+ vpclmulqdq zmm2, zmm0, zmm10, 0x10
+ vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
+
+ vpclmulqdq zmm5, zmm4, zmm10, 0x01
+ vpclmulqdq zmm6, zmm4, zmm10, 0x10
+ vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
+
+ vmovdqa32 zmm0, zmm7
+ vmovdqa32 zmm4, zmm8
+
+ add arg3, 128
+ jmp .fold_128_B_register
+
+
+
+	; at this point there are 128*x+y (0<=y<128) bytes of buffer left. The fold_128_B_loop
+	; will fold 128B at a time until only 128+y bytes remain
+
+	; fold 128B at a time. This section of the code folds two zmm registers (eight 16B lanes) in parallel
+.fold_128_B_loop:
+ add arg2, 128
+ vmovdqu8 zmm8, [arg2+16*0]
+ vpclmulqdq zmm2, zmm0, zmm10, 0x10
+ vpclmulqdq zmm1, zmm0, zmm10, 0x01
+ vpxorq zmm0, zmm2, zmm1
+ vpxorq zmm0, zmm0, zmm8
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpclmulqdq zmm5, zmm4, zmm10, 0x10
+ vpclmulqdq zmm6, zmm4, zmm10, 0x01
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ sub arg3, 128
+ jge .fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+	; the 128B of folded data is in two zmm registers: zmm0 and zmm4
+
+.fold_128_B_register:
+ ; fold the 8 128b parts into 1 xmm register with different constants
+ vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
+ vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
+ vpclmulqdq zmm1, zmm0, zmm16, 0x01
+ vpclmulqdq zmm2, zmm0, zmm16, 0x10
+ vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
+
+ vpclmulqdq zmm5, zmm4, zmm11, 0x01
+ vpclmulqdq zmm6, zmm4, zmm11, 0x10
+ vmovdqa xmm10, [rk1] ; Needed later in reduction loop
+ vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
+ vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
+
+ vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
+ vpxorq ymm8, ymm8, ymm1
+ vextracti64x2 xmm5, ymm8, 1
+ vpxorq xmm7, xmm5, xmm8
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl .final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+.16B_reduction_loop:
+ vpclmulqdq xmm8, xmm7, xmm10, 0x1
+ vpclmulqdq xmm7, xmm7, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge .16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+.final_reduction_for_128:
+ add arg3, 16
+ je .128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset
+ ; the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+.get_last_two_xmms:
+
+ vmovdqa xmm2, xmm7
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ vmovdqu xmm0, [rax]
+
+ vpshufb xmm7, xmm0
+ vpxor xmm0, [mask3]
+ vpshufb xmm2, xmm0
+
+ vpblendvb xmm2, xmm2, xmm1, xmm0
+ ;;;;;;;;;;
+ vpclmulqdq xmm8, xmm7, xmm10, 0x1
+ vpclmulqdq xmm7, xmm7, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+.128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5]
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0
+ vpsrldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm7
+ vpslldq xmm7, 4
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm0
+
+
+ ;barrett reduction
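+	; Barrett reduction (see the reference paper above): estimate the quotient
+	; with a carry-less multiply by mu (rk7, imm 0x00), multiply that estimate
+	; by the polynomial (rk8, imm 0x10), and xor to leave the remainder; for
+	; this reflected CRC the final 32-bit result lands in dword 2 of xmm7,
+	; hence the vpextrd below.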
+.barrett:
+ vpand xmm7, [mask2]
+ vmovdqa xmm1, xmm7
+ vmovdqa xmm2, xmm7
+ vmovdqa xmm10, [rk7]
+
+ vpclmulqdq xmm7, xmm10, 0
+ vpxor xmm7, xmm2
+ vpand xmm7, [mask]
+ vmovdqa xmm2, xmm7
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm2
+ vpxor xmm7, xmm1
+ vpextrd eax, xmm7, 2
+
+.cleanup:
+ not eax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+.less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl .less_than_32
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp .16B_reduction_loop
+
+
+align 16
+.less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je .cleanup
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+
+ cmp arg3, 16
+ je .exact_16_left
+ jl .less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp .get_last_two_xmms
+
+align 16
+.less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl .only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl .less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+.less_than_8_left:
+
+ cmp arg3, 4
+ jl .less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+.less_than_4_left:
+
+ cmp arg3, 2
+ jl .less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+.less_than_2_left:
+ cmp arg3, 1
+ jl .zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+.zero_left:
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+ vmovdqu xmm0, [rax + r9]
+ vpshufb xmm7,xmm0
+ jmp .128_done
+
+align 16
+.exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ jmp .128_done
+
+.only_less_than_4:
+ cmp arg3, 3
+ jl .only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 5
+ jmp .barrett
+
+.only_less_than_3:
+ cmp arg3, 2
+ jl .only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 6
+ jmp .barrett
+
+.only_less_than_2:
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 7
+ jmp .barrett
+
+section .data
+align 32
+
+%ifndef USE_CONSTS
+; precomputed constants
+rk_1: dq 0x00000000e95c1271
+rk_2: dq 0x00000000ce3371cb
+rk1: dq 0x00000000ccaa009e
+rk2: dq 0x00000001751997d0
+rk3: dq 0x000000014a7fe880
+rk4: dq 0x00000001e88ef372
+rk5: dq 0x00000000ccaa009e
+rk6: dq 0x0000000163cd6124
+rk7: dq 0x00000001f7011640
+rk8: dq 0x00000001db710640
+rk9: dq 0x00000001d7cfc6ac
+rk10: dq 0x00000001ea89367e
+rk11: dq 0x000000018cb44e58
+rk12: dq 0x00000000df068dc2
+rk13: dq 0x00000000ae0b5394
+rk14: dq 0x00000001c7569e54
+rk15: dq 0x00000001c6e41596
+rk16: dq 0x0000000154442bd4
+rk17: dq 0x0000000174359406
+rk18: dq 0x000000003db1ecdc
+rk19: dq 0x000000015a546366
+rk20: dq 0x00000000f1da05aa
+
+rk_1b: dq 0x00000000ccaa009e
+rk_2b: dq 0x00000001751997d0
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%else
+INCLUDE_CONSTS
+%endif
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3: dq 0x8080808080808080, 0x8080808080808080
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FUNCTION_NAME
+no_ %+ FUNCTION_NAME %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
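Because the routine inverts the seed on entry (not arg1_low32) and inverts the accumulator again on return (not eax), a zero-length or NULL buffer hands the seed back unchanged, and an intermediate result can be fed back in as the seed of a later call; the null check in eob_test() and the seed chaining in update_test() of crc32_funcs_test.c above rely on exactly this. A small usage sketch, assuming only the crc32_gzip_refl() prototype from crc.h that those tests use (illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>
    #include "crc.h"        /* declares crc32_gzip_refl(), as used by the tests above */

    static void check_gzip_crc_usage(void)
    {
            const uint8_t msg[] = "hello, crc";
            uint32_t seed = 0, one_shot, chained;

            /* A zero-length buffer returns the seed unchanged. */
            assert(crc32_gzip_refl(seed, NULL, 0) == seed);

            /* One-shot and chained calls over the same bytes must agree. */
            one_shot = crc32_gzip_refl(seed, msg, sizeof(msg) - 1);
            chained = crc32_gzip_refl(seed, msg, 5);
            chained = crc32_gzip_refl(chained, msg + 5, sizeof(msg) - 1 - 5);
            assert(one_shot == chained);
    }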
diff --git a/src/isa-l/crc/crc32_gzip_refl_by8.asm b/src/isa-l/crc/crc32_gzip_refl_by8.asm
new file mode 100644
index 000000000..43840244a
--- /dev/null
+++ b/src/isa-l/crc/crc32_gzip_refl_by8.asm
@@ -0,0 +1,625 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; UINT32 crc32_gzip_refl_by8(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
+;
+; sample yasm command line:
+; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by8
+;
+; As explained here:
+; http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
+; CRC-32 checksum is described in RFC 1952
+; Implementing RFC 1952 CRC:
+; http://www.ietf.org/rfc/rfc1952.txt
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global crc32_gzip_refl_by8, function
+crc32_gzip_refl_by8:
+ endbranch
+
+ ; unsigned long c = crc ^ 0xffffffffL;
+ not arg1_low32 ;
+
+
+ sub rsp, VARIABLE_OFFSET
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movd xmm10, arg1_low32 ; initial crc
+
+ ; receive the initial 64B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point there are 128*x+y (0<=y<128) bytes of buffer left. The _fold_128_B_loop
+	; will fold 128B at a time until only 128+y bytes remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm1, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm3, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm5, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm7, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1]
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+
+	; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ movdqa xmm2, xmm7
+ movdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ movdqu xmm0, [rax]
+
+
+ pshufb xmm7, xmm0
+ pxor xmm0, [mask3]
+ pshufb xmm2, xmm0
+
+ pblendvb xmm2, xmm1 ;xmm0 is implicit
+ ;;;;;;;;;;
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5]
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0
+ psrldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm7
+ pslldq xmm7, 4
+ pclmulqdq xmm7, xmm10, 0x10
+
+ pxor xmm7, xmm0
+
+
+ ;barrett reduction
+_barrett:
+ pand xmm7, [mask2]
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm7
+ movdqa xmm10, [rk7]
+
+ pclmulqdq xmm7, xmm10, 0
+ pxor xmm7, xmm2
+ pand xmm7, [mask]
+ movdqa xmm2, xmm7
+ pclmulqdq xmm7, xmm10, 0x10
+ pxor xmm7, xmm2
+ pxor xmm7, xmm1
+ pextrd eax, xmm7, 2
+
+_cleanup:
+ ; return c ^ 0xffffffffL;
+ not eax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+ movdqu xmm0, [rax + r9]
+ pshufb xmm7,xmm0
+
+
+
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ pslldq xmm7, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ pslldq xmm7, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ pslldq xmm7, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+align 16
+rk1 :
+DQ 0x00000000ccaa009e
+rk2 :
+DQ 0x00000001751997d0
+rk3 :
+DQ 0x000000014a7fe880
+rk4 :
+DQ 0x00000001e88ef372
+rk5 :
+DQ 0x00000000ccaa009e
+rk6 :
+DQ 0x0000000163cd6124
+rk7 :
+DQ 0x00000001f7011640
+rk8 :
+DQ 0x00000001db710640
+rk9 :
+DQ 0x00000001d7cfc6ac
+rk10 :
+DQ 0x00000001ea89367e
+rk11 :
+DQ 0x000000018cb44e58
+rk12 :
+DQ 0x00000000df068dc2
+rk13 :
+DQ 0x00000000ae0b5394
+rk14 :
+DQ 0x00000001c7569e54
+rk15 :
+DQ 0x00000001c6e41596
+rk16 :
+DQ 0x0000000154442bd4
+rk17 :
+DQ 0x0000000174359406
+rk18 :
+DQ 0x000000003db1ecdc
+rk19 :
+DQ 0x000000015a546366
+rk20 :
+DQ 0x00000000f1da05aa
+
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3:
+dq 0x8080808080808080, 0x8080808080808080
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+;;; func core, ver, snum
+slversion crc32_gzip_refl_by8, 01, 00, 002c
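The file that follows, crc32_gzip_refl_by8_02.asm, is the same by-8 folding algorithm re-encoded with VEX (AVX) instructions, and crc32_gzip_refl_by16_10.asm above is the AVX-512 variant gated on AS_FEATURE_LEVEL >= 10. isa-l normally picks between such variants at run time through its multibinary dispatcher; the sketch below is a rough, hypothetical illustration of that kind of CPU-feature selection in C, not isa-l's actual dispatcher, and the C prototypes for the assembly entry points are assumptions made for the example.

    #include <stdint.h>

    /* Assumed C prototypes for the assembly routines added in this patch. */
    uint32_t crc32_gzip_refl_by8(uint32_t seed, const uint8_t *buf, uint64_t len);
    uint32_t crc32_gzip_refl_by8_02(uint32_t seed, const uint8_t *buf, uint64_t len);
    uint32_t crc32_gzip_refl_base(uint32_t seed, const uint8_t *buf, uint64_t len);

    typedef uint32_t (*crc32_fn)(uint32_t, const uint8_t *, uint64_t);

    /* Pick the widest variant the CPU supports: PCLMULQDQ is required by both
     * assembly versions, and the _02 encoding additionally requires AVX. */
    static crc32_fn select_crc32_gzip_refl(void)
    {
            __builtin_cpu_init();
            if (__builtin_cpu_supports("pclmul")) {
                    if (__builtin_cpu_supports("avx"))
                            return crc32_gzip_refl_by8_02;
                    return crc32_gzip_refl_by8;
            }
            return crc32_gzip_refl_base;   /* portable fallback */
    }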
diff --git a/src/isa-l/crc/crc32_gzip_refl_by8_02.asm b/src/isa-l/crc/crc32_gzip_refl_by8_02.asm
new file mode 100644
index 000000000..712fe87aa
--- /dev/null
+++ b/src/isa-l/crc/crc32_gzip_refl_by8_02.asm
@@ -0,0 +1,556 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; UINT32 crc32_gzip_refl_by8_02(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
+;
+; sample yasm command line:
+; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by8_02
+;
+; As explained here:
+; http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
+; CRC-32 checksum is described in RFC 1952
+; Implementing RFC 1952 CRC:
+; http://www.ietf.org/rfc/rfc1952.txt
+
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global crc32_gzip_refl_by8_02, function
+crc32_gzip_refl_by8_02:
+ endbranch
+ not arg1_low32
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+ jl .less_than_256
+
+ ; load the initial crc value
+ vmovd xmm10, arg1_low32 ; initial crc
+
+ ; receive the initial 64B data, xor the initial crc value
+ vmovdqu xmm0, [arg2+16*0]
+ vmovdqu xmm1, [arg2+16*1]
+ vmovdqu xmm2, [arg2+16*2]
+ vmovdqu xmm3, [arg2+16*3]
+ vmovdqu xmm4, [arg2+16*4]
+ vmovdqu xmm5, [arg2+16*5]
+ vmovdqu xmm6, [arg2+16*6]
+ vmovdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value
+ vpxor xmm0, xmm10
+ vmovdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point there are 128*x+y (0<=y<128) bytes of buffer left. The fold_128_B_loop
+	; will fold 128B at a time until only 128+y bytes remain
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+.fold_128_B_loop:
+ add arg2, 128
+ prefetchnta [arg2+fetch_dist+0]
+ vmovdqu xmm9, [arg2+16*0]
+ vmovdqu xmm12, [arg2+16*1]
+ vpclmulqdq xmm8, xmm0, xmm10, 0x10
+ vpclmulqdq xmm0, xmm0, xmm10 , 0x1
+ vpclmulqdq xmm13, xmm1, xmm10, 0x10
+ vpclmulqdq xmm1, xmm1, xmm10 , 0x1
+ vpxor xmm0, xmm9
+ vxorps xmm0, xmm8
+ vpxor xmm1, xmm12
+ vxorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ vmovdqu xmm9, [arg2+16*2]
+ vmovdqu xmm12, [arg2+16*3]
+ vpclmulqdq xmm8, xmm2, xmm10, 0x10
+ vpclmulqdq xmm2, xmm2, xmm10 , 0x1
+ vpclmulqdq xmm13, xmm3, xmm10, 0x10
+ vpclmulqdq xmm3, xmm3, xmm10 , 0x1
+ vpxor xmm2, xmm9
+ vxorps xmm2, xmm8
+ vpxor xmm3, xmm12
+ vxorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ vmovdqu xmm9, [arg2+16*4]
+ vmovdqu xmm12, [arg2+16*5]
+ vpclmulqdq xmm8, xmm4, xmm10, 0x10
+ vpclmulqdq xmm4, xmm4, xmm10 , 0x1
+ vpclmulqdq xmm13, xmm5, xmm10, 0x10
+ vpclmulqdq xmm5, xmm5, xmm10 , 0x1
+ vpxor xmm4, xmm9
+ vxorps xmm4, xmm8
+ vpxor xmm5, xmm12
+ vxorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ vmovdqu xmm9, [arg2+16*6]
+ vmovdqu xmm12, [arg2+16*7]
+ vpclmulqdq xmm8, xmm6, xmm10, 0x10
+ vpclmulqdq xmm6, xmm6, xmm10 , 0x1
+ vpclmulqdq xmm13, xmm7, xmm10, 0x10
+ vpclmulqdq xmm7, xmm7, xmm10 , 0x1
+ vpxor xmm6, xmm9
+ vxorps xmm6, xmm8
+ vpxor xmm7, xmm12
+ vxorps xmm7, xmm13
+
+ sub arg3, 128
+ jge .fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+ vmovdqa xmm10, [rk9]
+ vpclmulqdq xmm8, xmm0, xmm10, 0x1
+ vpclmulqdq xmm0, xmm0, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm0
+
+ vmovdqa xmm10, [rk11]
+ vpclmulqdq xmm8, xmm1, xmm10, 0x1
+ vpclmulqdq xmm1, xmm1, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm1
+
+ vmovdqa xmm10, [rk13]
+ vpclmulqdq xmm8, xmm2, xmm10, 0x1
+ vpclmulqdq xmm2, xmm2, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+ vmovdqa xmm10, [rk15]
+ vpclmulqdq xmm8, xmm3, xmm10, 0x1
+ vpclmulqdq xmm3, xmm3, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm3
+
+ vmovdqa xmm10, [rk17]
+ vpclmulqdq xmm8, xmm4, xmm10, 0x1
+ vpclmulqdq xmm4, xmm4, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm4
+
+ vmovdqa xmm10, [rk19]
+ vpclmulqdq xmm8, xmm5, xmm10, 0x1
+ vpclmulqdq xmm5, xmm5, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm5
+
+ vmovdqa xmm10, [rk1]
+ vpclmulqdq xmm8, xmm6, xmm10, 0x1
+ vpclmulqdq xmm6, xmm6, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl .final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+.16B_reduction_loop:
+ vpclmulqdq xmm8, xmm7, xmm10, 0x1
+ vpclmulqdq xmm7, xmm7, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge .16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+.final_reduction_for_128:
+ add arg3, 16
+ je .128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset
+ ; the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+.get_last_two_xmms:
+
+ vmovdqa xmm2, xmm7
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ vmovdqu xmm0, [rax]
+
+ vpshufb xmm7, xmm0
+ vpxor xmm0, [mask3]
+ vpshufb xmm2, xmm0
+
+ vpblendvb xmm2, xmm2, xmm1, xmm0
+ ;;;;;;;;;;
+ vpclmulqdq xmm8, xmm7, xmm10, 0x1
+ vpclmulqdq xmm7, xmm7, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+.128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5]
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0
+ vpsrldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm7
+ vpslldq xmm7, 4
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm0
+
+
+ ;barrett reduction
+.barrett:
+ vpand xmm7, [mask2]
+ vmovdqa xmm1, xmm7
+ vmovdqa xmm2, xmm7
+ vmovdqa xmm10, [rk7]
+
+ vpclmulqdq xmm7, xmm10, 0
+ vpxor xmm7, xmm2
+ vpand xmm7, [mask]
+ vmovdqa xmm2, xmm7
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm2
+ vpxor xmm7, xmm1
+ vpextrd eax, xmm7, 2
+
+.cleanup:
+ not eax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+.less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl .less_than_32
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp .16B_reduction_loop
+
+
+align 16
+.less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je .cleanup
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+
+ cmp arg3, 16
+ je .exact_16_left
+ jl .less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp .get_last_two_xmms
+
+align 16
+.less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl .only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl .less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+.less_than_8_left:
+
+ cmp arg3, 4
+ jl .less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+.less_than_4_left:
+
+ cmp arg3, 2
+ jl .less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+.less_than_2_left:
+ cmp arg3, 1
+ jl .zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+.zero_left:
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+ vmovdqu xmm0, [rax + r9]
+ vpshufb xmm7,xmm0
+ jmp .128_done
+
+align 16
+.exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ jmp .128_done
+
+.only_less_than_4:
+ cmp arg3, 3
+ jl .only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 5
+ jmp .barrett
+
+.only_less_than_3:
+ cmp arg3, 2
+ jl .only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 6
+ jmp .barrett
+
+.only_less_than_2:
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 7
+ jmp .barrett
+
+section .data
+
+; precomputed constants
+align 16
+rk1: dq 0x00000000ccaa009e
+rk2: dq 0x00000001751997d0
+rk3: dq 0x000000014a7fe880
+rk4: dq 0x00000001e88ef372
+rk5: dq 0x00000000ccaa009e
+rk6: dq 0x0000000163cd6124
+rk7: dq 0x00000001f7011640
+rk8: dq 0x00000001db710640
+rk9: dq 0x00000001d7cfc6ac
+rk10: dq 0x00000001ea89367e
+rk11: dq 0x000000018cb44e58
+rk12: dq 0x00000000df068dc2
+rk13: dq 0x00000000ae0b5394
+rk14: dq 0x00000001c7569e54
+rk15: dq 0x00000001c6e41596
+rk16: dq 0x0000000154442bd4
+rk17: dq 0x0000000174359406
+rk18: dq 0x000000003db1ecdc
+rk19: dq 0x000000015a546366
+rk20: dq 0x00000000f1da05aa
+
+mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3: dq 0x8080808080808080, 0x8080808080808080
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
diff --git a/src/isa-l/crc/crc32_gzip_refl_perf.c b/src/isa-l/crc/crc32_gzip_refl_perf.c
new file mode 100644
index 000000000..ad3d86fb5
--- /dev/null
+++ b/src/isa-l/crc/crc32_gzip_refl_perf.c
@@ -0,0 +1,91 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+int main(int argc, char *argv[])
+{
+ void *buf;
+ uint32_t crc;
+ struct perf start;
+
+ printf("crc32_gzip_refl_perf:\n");
+
+ if (posix_memalign(&buf, 1024, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ memset(buf, 0, TEST_LEN);
+ BENCHMARK(&start, BENCHMARK_TIME, crc = crc32_gzip_refl(TEST_SEED, buf, TEST_LEN));
+ printf("crc32_gzip_refl" TEST_TYPE_STR ": ");
+ perf_print(start, (long long)TEST_LEN);
+
+ printf("finish 0x%x\n", crc);
+
+ printf("crc32_gzip_refl_base_perf:\n");
+ printf("Start timed tests\n");
+ fflush(0);
+
+ BENCHMARK(&start, BENCHMARK_TIME, crc =
+ crc32_gzip_refl_base(TEST_SEED, buf, TEST_LEN));
+ printf("crc32_gzip_refl_base" TEST_TYPE_STR ": ");
+ perf_print(start, (long long)TEST_LEN);
+
+ printf("finish 0x%x\n", crc);
+
+ return 0;
+}
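The BENCHMARK and perf_print helpers come from test.h, which sits outside the crc/ directory and therefore outside this diff. To reproduce the measurement without the isa-l test harness, a minimal stand-in using POSIX clock_gettime (an assumption here, not what the harness does) could look like:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    #include <time.h>
    #include "crc.h"

    #define LEN   (64 * 1024 * 1024)   /* larger than last-level cache */
    #define ITERS 20

    int main(void)
    {
            void *buf;
            uint32_t crc = 0;
            struct timespec t0, t1;
            double secs;
            int i;

            if (posix_memalign(&buf, 1024, LEN)) {
                    printf("alloc error: Fail");
                    return -1;
            }
            memset(buf, 0, LEN);

            clock_gettime(CLOCK_MONOTONIC, &t0);
            for (i = 0; i < ITERS; i++)
                    crc = crc32_gzip_refl(crc, buf, LEN);   /* chain the seed */
            clock_gettime(CLOCK_MONOTONIC, &t1);

            secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
            printf("crc32_gzip_refl: %.1f MB/s (crc 0x%x)\n",
                   (double)LEN * ITERS / secs / 1000000.0, crc);
            free(buf);
            return 0;
    }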
diff --git a/src/isa-l/crc/crc32_ieee_01.asm b/src/isa-l/crc/crc32_ieee_01.asm
new file mode 100644
index 000000000..368261de2
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_01.asm
@@ -0,0 +1,656 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; UINT32 crc32_ieee_01(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
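+; Usage sketch (editorial note, not part of the original header): callers normally go
+; through the crc32_ieee() multibinary dispatcher rather than invoking this variant
+; directly, but a direct call would look like:
+;
+;     #include <stdint.h>
+;     extern uint32_t crc32_ieee_01(uint32_t init_crc, const unsigned char *buf,
+;                                   uint64_t len);
+;
+;     uint32_t crc = crc32_ieee_01(0, data, data_len);
+;
+; The routine complements init_crc on entry and the result on exit (see the `not`
+; instructions below), so passing 0 gives the conventional CRC-32/IEEE behaviour.
+;
+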
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+align 16
+mk_global crc32_ieee_01, function
+crc32_ieee_01:
+ endbranch
+
+ not arg1_low32 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the callee-saved xmm registers on the stack (xmm6 and up are callee-saved under the win64 ABI)
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movd xmm10, arg1_low32 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then align with the initial crc in the correct place.
+ pslldq xmm10, 12
+
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
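+	; Folding identity (editorial sketch): each 128-bit lane A is carried forward
+	; across 128 bytes of new data D as
+	;     A' = clmul(A.lo64, rk3) xor clmul(A.hi64, rk4) xor D
+	; over GF(2), where rk3/rk4 are precomputed powers of x reduced modulo the CRC
+	; polynomial for the 128-byte shift. The pclmulqdq immediate selects the halves:
+	; 0x00 multiplies the two low qwords, 0x11 the two high qwords.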
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+	; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x1
+ pslldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm7
+
+ pand xmm0, [mask2]
+
+ psrldq xmm7, 12
+ pclmulqdq xmm7, xmm10, 0x10
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
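+	; Barrett reduction (editorial sketch): the folded remainder R in xmm7 is reduced
+	; modulo the polynomial without a division, using two carry-less multiplies:
+	;     T1  = floor(R / x^32) * rk7     ; rk7 ~ floor(x^64 / P)
+	;     T2  = floor(T1 / x^32) * rk8    ; rk8 = P, the CRC-32/IEEE polynomial
+	;     crc = R xor T2                  ; taken from dword 1 by the pextrd below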
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7
+ pclmulqdq xmm7, xmm10, 0x01
+ pslldq xmm7, 4
+ pclmulqdq xmm7, xmm10, 0x11
+
+ pslldq xmm7, 4
+ pxor xmm7, xmm0
+ pextrd eax, xmm7,1
+
+_cleanup:
+ not eax
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp,VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movd xmm0, arg1_low32 ; get the initial crc value
+ pslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ psrldq xmm7, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+align 16
+
+rk1 :
+DQ 0xf200aa6600000000
+rk2 :
+DQ 0x17d3315d00000000
+rk3 :
+DQ 0x022ffca500000000
+rk4 :
+DQ 0x9d9ee22f00000000
+rk5 :
+DQ 0xf200aa6600000000
+rk6 :
+DQ 0x490d678d00000000
+rk7 :
+DQ 0x0000000104d101df
+rk8 :
+DQ 0x0000000104c11db7
+rk9 :
+DQ 0x6ac7e7d700000000
+rk10 :
+DQ 0xfcd922af00000000
+rk11 :
+DQ 0x34e45a6300000000
+rk12 :
+DQ 0x8762c1f600000000
+rk13 :
+DQ 0x5395a0ea00000000
+rk14 :
+DQ 0x54f2d5c700000000
+rk15 :
+DQ 0xd3504ec700000000
+rk16 :
+DQ 0x57a8445500000000
+rk17 :
+DQ 0xc053585d00000000
+rk18 :
+DQ 0x766f1b7800000000
+rk19 :
+DQ 0xcd8c54b500000000
+rk20 :
+DQ 0xab40b71e00000000
+
+
+
+
+
+
+
+
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
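+; (Editorial note: SHUF_MASK is a full byte-reverse permutation for pshufb; each
+; 16-byte block is flipped end-to-end so the byte order matches the non-reflected
+; bit order of the CRC-32/IEEE polynomial.)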
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+;;; func core, ver, snum
+slversion crc32_ieee_01, 01, 06, 0011
+
diff --git a/src/isa-l/crc/crc32_ieee_02.asm b/src/isa-l/crc/crc32_ieee_02.asm
new file mode 100644
index 000000000..95d53e8a3
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_02.asm
@@ -0,0 +1,652 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; UINT32 crc32_ieee_02(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
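+; Editorial note: this is the VEX-encoded (AVX) counterpart of crc32_ieee_01; the
+; algorithm and constants are identical, the main difference being the explicit
+; three-operand instruction forms, e.g.
+;     pblendvb  xmm1, xmm2              ; SSE4.1 form, xmm0 is the implicit mask
+;     vpblendvb xmm1, xmm1, xmm2, xmm0  ; AVX form, the mask register is explicit
+;
+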
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+align 16
+mk_global crc32_ieee_02, function
+crc32_ieee_02:
+ endbranch
+
+ not arg1_low32 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the callee-saved xmm registers on the stack (xmm6 and up are callee-saved under the win64 ABI)
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ vmovd xmm10, arg1_low32 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then align with the initial crc in the correct place.
+ vpslldq xmm10, 12
+
+ vmovdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ vmovdqu xmm0, [arg2+16*0]
+ vmovdqu xmm1, [arg2+16*1]
+ vmovdqu xmm2, [arg2+16*2]
+ vmovdqu xmm3, [arg2+16*3]
+ vmovdqu xmm4, [arg2+16*4]
+ vmovdqu xmm5, [arg2+16*5]
+ vmovdqu xmm6, [arg2+16*6]
+ vmovdqu xmm7, [arg2+16*7]
+
+ vpshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ vpxor xmm0, xmm10
+ vpshufb xmm1, xmm11
+ vpshufb xmm2, xmm11
+ vpshufb xmm3, xmm11
+ vpshufb xmm4, xmm11
+ vpshufb xmm5, xmm11
+ vpshufb xmm6, xmm11
+ vpshufb xmm7, xmm11
+
+ vmovdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ vmovdqu xmm9, [arg2+16*0]
+ vmovdqu xmm12, [arg2+16*1]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm13, xmm1
+ vpclmulqdq xmm0, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm1, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm0, xmm9
+ vxorps xmm0, xmm8
+ vpxor xmm1, xmm12
+ vxorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ vmovdqu xmm9, [arg2+16*2]
+ vmovdqu xmm12, [arg2+16*3]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm13, xmm3
+ vpclmulqdq xmm2, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm3, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm2, xmm9
+ vxorps xmm2, xmm8
+ vpxor xmm3, xmm12
+ vxorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ vmovdqu xmm9, [arg2+16*4]
+ vmovdqu xmm12, [arg2+16*5]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm4
+ vmovdqa xmm13, xmm5
+ vpclmulqdq xmm4, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm5, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm4, xmm9
+ vxorps xmm4, xmm8
+ vpxor xmm5, xmm12
+ vxorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ vmovdqu xmm9, [arg2+16*6]
+ vmovdqu xmm12, [arg2+16*7]
+ vpshufb xmm9, xmm11
+ vpshufb xmm12, xmm11
+ vmovdqa xmm8, xmm6
+ vmovdqa xmm13, xmm7
+ vpclmulqdq xmm6, xmm10, 0x0
+ vpclmulqdq xmm8, xmm10 , 0x11
+ vpclmulqdq xmm7, xmm10, 0x0
+ vpclmulqdq xmm13, xmm10 , 0x11
+ vpxor xmm6, xmm9
+ vxorps xmm6, xmm8
+ vpxor xmm7, xmm12
+ vxorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
+	; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ vmovdqa xmm10, [rk9]
+ vmovdqa xmm8, xmm0
+ vpclmulqdq xmm0, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm0
+
+ vmovdqa xmm10, [rk11]
+ vmovdqa xmm8, xmm1
+ vpclmulqdq xmm1, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm1
+
+ vmovdqa xmm10, [rk13]
+ vmovdqa xmm8, xmm2
+ vpclmulqdq xmm2, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+ vmovdqa xmm10, [rk15]
+ vmovdqa xmm8, xmm3
+ vpclmulqdq xmm3, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm3
+
+ vmovdqa xmm10, [rk17]
+ vmovdqa xmm8, xmm4
+ vpclmulqdq xmm4, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm4
+
+ vmovdqa xmm10, [rk19]
+ vmovdqa xmm8, xmm5
+ vpclmulqdq xmm5, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vxorps xmm7, xmm5
+
+ vmovdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ vmovdqa xmm8, xmm6
+ vpclmulqdq xmm6, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpshufb xmm0, xmm11
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ vmovdqa xmm2, xmm7
+
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+ vpshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ vmovdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ vpshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ vpxor xmm0, [mask1]
+ vpshufb xmm7, xmm0
+ vpblendvb xmm1, xmm1, xmm2, xmm0
+
+ ; fold 16 Bytes
+ vmovdqa xmm2, xmm1
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0x1
+ vpslldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm7
+
+ vpand xmm0, [mask2]
+
+ vpsrldq xmm7, 12
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ vmovdqa xmm0, xmm7
+ vpclmulqdq xmm7, xmm10, 0x01
+ vpslldq xmm7, 4
+ vpclmulqdq xmm7, xmm10, 0x11
+
+ vpslldq xmm7, 4
+ vpxor xmm7, xmm0
+ vpextrd eax, xmm7,1
+
+_cleanup:
+ not eax
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp,VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ vmovdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm11 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ vmovdqa xmm11, [SHUF_MASK]
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm11 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ vmovdqu xmm0, [rax]
+ vpxor xmm0, [mask1]
+
+ vpshufb xmm7, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 6
+
+ jmp _barrett
+_only_less_than_2:
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm11
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 7
+
+ jmp _barrett
+
+section .data
+
+; precomputed constants
+align 16
+
+rk1 :
+DQ 0xf200aa6600000000
+rk2 :
+DQ 0x17d3315d00000000
+rk3 :
+DQ 0x022ffca500000000
+rk4 :
+DQ 0x9d9ee22f00000000
+rk5 :
+DQ 0xf200aa6600000000
+rk6 :
+DQ 0x490d678d00000000
+rk7 :
+DQ 0x0000000104d101df
+rk8 :
+DQ 0x0000000104c11db7
+rk9 :
+DQ 0x6ac7e7d700000000
+rk10 :
+DQ 0xfcd922af00000000
+rk11 :
+DQ 0x34e45a6300000000
+rk12 :
+DQ 0x8762c1f600000000
+rk13 :
+DQ 0x5395a0ea00000000
+rk14 :
+DQ 0x54f2d5c700000000
+rk15 :
+DQ 0xd3504ec700000000
+rk16 :
+DQ 0x57a8445500000000
+rk17 :
+DQ 0xc053585d00000000
+rk18 :
+DQ 0x766f1b7800000000
+rk19 :
+DQ 0xcd8c54b500000000
+rk20 :
+DQ 0xab40b71e00000000
+
+
+
+
+
+
+
+
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
diff --git a/src/isa-l/crc/crc32_ieee_by16_10.asm b/src/isa-l/crc/crc32_ieee_by16_10.asm
new file mode 100644
index 000000000..5c3f52a93
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_by16_10.asm
@@ -0,0 +1,585 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+;	UINT32 crc32_ieee_by16_10(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+;
+;
+
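+; Editorial note: this variant folds 256 bytes per iteration in 512-bit zmm registers
+; and is gated below by `%if (AS_FEATURE_LEVEL) >= 10`, i.e. it is only assembled when
+; the assembler understands the required AVX-512/VPCLMULQDQ opcodes; otherwise only an
+; empty no_<name> symbol is emitted on win64 (see the %else branch at the end).
+;
+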
+%include "reg_sizes.asm"
+
+%ifndef FUNCTION_NAME
+%define FUNCTION_NAME crc32_ieee_by16_10
+%endif
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*12+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global FUNCTION_NAME, function
+FUNCTION_NAME:
+ endbranch
+
+ not arg1_low32
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the callee-saved xmm registers on the stack (xmm6 and up are callee-saved under the win64 ABI)
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+%endif
+
+ vbroadcasti32x4 zmm18, [SHUF_MASK]
+ cmp arg3, 256
+ jl .less_than_256
+
+ ; load the initial crc value
+ vmovd xmm10, arg1_low32 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then align with the initial crc in the correct place.
+ vpslldq xmm10, 12
+
+ ; receive the initial 64B data, xor the initial crc value
+ vmovdqu8 zmm0, [arg2+16*0]
+ vmovdqu8 zmm4, [arg2+16*4]
+ vpshufb zmm0, zmm0, zmm18
+ vpshufb zmm4, zmm4, zmm18
+ vpxorq zmm0, zmm10
+ vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+
+ sub arg3, 256
+ cmp arg3, 256
+ jl .fold_128_B_loop
+
+ vmovdqu8 zmm7, [arg2+16*8]
+ vmovdqu8 zmm8, [arg2+16*12]
+ vpshufb zmm7, zmm7, zmm18
+ vpshufb zmm8, zmm8, zmm18
+ vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
+ sub arg3, 256
+
+.fold_256_B_loop:
+ add arg2, 256
+ vmovdqu8 zmm3, [arg2+16*0]
+ vpshufb zmm3, zmm3, zmm18
+ vpclmulqdq zmm1, zmm0, zmm16, 0x00
+ vpclmulqdq zmm2, zmm0, zmm16, 0x11
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm3
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpshufb zmm9, zmm9, zmm18
+ vpclmulqdq zmm5, zmm4, zmm16, 0x00
+ vpclmulqdq zmm6, zmm4, zmm16, 0x11
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ vmovdqu8 zmm11, [arg2+16*8]
+ vpshufb zmm11, zmm11, zmm18
+ vpclmulqdq zmm12, zmm7, zmm16, 0x00
+ vpclmulqdq zmm13, zmm7, zmm16, 0x11
+ vpxorq zmm7, zmm12, zmm13
+ vpxorq zmm7, zmm7, zmm11
+
+ vmovdqu8 zmm17, [arg2+16*12]
+ vpshufb zmm17, zmm17, zmm18
+ vpclmulqdq zmm14, zmm8, zmm16, 0x00
+ vpclmulqdq zmm15, zmm8, zmm16, 0x11
+ vpxorq zmm8, zmm14, zmm15
+ vpxorq zmm8, zmm8, zmm17
+
+ sub arg3, 256
+ jge .fold_256_B_loop
+
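+	; Editorial note: vpternlogq with immediate 0x96 (the truth table for A^B^C) is a
+	; three-way XOR, so dst = dst ^ srcA ^ srcB; it merges the two carry-less products
+	; with the accumulated lane in a single instruction below.
+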
+ ;; Fold 256 into 128
+ add arg2, 256
+ vpclmulqdq zmm1, zmm0, zmm10, 0x00
+ vpclmulqdq zmm2, zmm0, zmm10, 0x11
+ vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
+
+ vpclmulqdq zmm5, zmm4, zmm10, 0x00
+ vpclmulqdq zmm6, zmm4, zmm10, 0x11
+ vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
+
+ vmovdqa32 zmm0, zmm7
+ vmovdqa32 zmm4, zmm8
+
+ add arg3, 128
+ jmp .fold_128_B_register
+
+
+
+ ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
+ ; loop will fold 128B at a time until we have 128+y Bytes of buffer
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+.fold_128_B_loop:
+ add arg2, 128
+ vmovdqu8 zmm8, [arg2+16*0]
+ vpshufb zmm8, zmm8, zmm18
+ vpclmulqdq zmm2, zmm0, zmm10, 0x00
+ vpclmulqdq zmm1, zmm0, zmm10, 0x11
+ vpxorq zmm0, zmm2, zmm1
+ vpxorq zmm0, zmm0, zmm8
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpshufb zmm9, zmm9, zmm18
+ vpclmulqdq zmm5, zmm4, zmm10, 0x00
+ vpclmulqdq zmm6, zmm4, zmm10, 0x11
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ sub arg3, 128
+ jge .fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+.fold_128_B_register:
+ ; fold the 8 128b parts into 1 xmm register with different constants
+ vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
+ vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
+ vpclmulqdq zmm1, zmm0, zmm16, 0x00
+ vpclmulqdq zmm2, zmm0, zmm16, 0x11
+ vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
+
+ vpclmulqdq zmm5, zmm4, zmm11, 0x00
+ vpclmulqdq zmm6, zmm4, zmm11, 0x11
+ vmovdqa xmm10, [rk1] ; Needed later in reduction loop
+ vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
+ vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
+
+ vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
+ vpxorq ymm8, ymm8, ymm1
+ vextracti64x2 xmm5, ymm8, 1
+ vpxorq xmm7, xmm5, xmm8
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl .final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+.16B_reduction_loop:
+ vpclmulqdq xmm8, xmm7, xmm10, 0x11
+ vpclmulqdq xmm7, xmm7, xmm10, 0x00
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpshufb xmm0, xmm0, xmm18
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge .16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+.final_reduction_for_128:
+ add arg3, 16
+ je .128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset
+ ; the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+.get_last_two_xmms:
+
+ vmovdqa xmm2, xmm7
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+ vpshufb xmm1, xmm18
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ vmovdqu xmm0, [rax]
+
+ vpshufb xmm2, xmm0
+ vpxor xmm0, [mask1]
+ vpshufb xmm7, xmm0
+ vpblendvb xmm1, xmm1, xmm2, xmm0
+
+ vpclmulqdq xmm8, xmm7, xmm10, 0x11
+ vpclmulqdq xmm7, xmm7, xmm10, 0x00
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm1
+
+.128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5]
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0x01 ; H*L
+ vpslldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm7
+ vpand xmm0, [mask2]
+ vpsrldq xmm7, 12
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm0
+
+ ;barrett reduction
+.barrett:
+ vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ vmovdqa xmm0, xmm7
+ vpclmulqdq xmm7, xmm10, 0x01
+ vpslldq xmm7, 4
+ vpclmulqdq xmm7, xmm10, 0x11
+
+ vpslldq xmm7, 4
+ vpxor xmm7, xmm0
+ vpextrd eax, xmm7, 1
+
+.cleanup:
+ not eax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+.less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl .less_than_32
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm18 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp .16B_reduction_loop
+
+
+align 16
+.less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je .cleanup
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vpslldq xmm0, 12 ; align it to its correct place
+
+ cmp arg3, 16
+ je .exact_16_left
+ jl .less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp .get_last_two_xmms
+
+align 16
+.less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl .only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl .less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+.less_than_8_left:
+
+ cmp arg3, 4
+ jl .less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+.less_than_4_left:
+
+ cmp arg3, 2
+ jl .less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+.less_than_2_left:
+ cmp arg3, 1
+ jl .zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+.zero_left:
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+ vmovdqu xmm0, [rax]
+ vpxor xmm0, [mask1]
+
+ vpshufb xmm7,xmm0
+ jmp .128_done
+
+align 16
+.exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ jmp .128_done
+
+.only_less_than_4:
+ cmp arg3, 3
+ jl .only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 5
+ jmp .barrett
+
+.only_less_than_3:
+ cmp arg3, 2
+ jl .only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 6
+ jmp .barrett
+
+.only_less_than_2:
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpsrldq xmm7, 7
+ jmp .barrett
+
+section .data
+align 32
+
+%ifndef USE_CONSTS
+; precomputed constants
+rk_1: dq 0x1851689900000000
+rk_2: dq 0xa3dc855100000000
+rk1: dq 0xf200aa6600000000
+rk2: dq 0x17d3315d00000000
+rk3: dq 0x022ffca500000000
+rk4: dq 0x9d9ee22f00000000
+rk5: dq 0xf200aa6600000000
+rk6: dq 0x490d678d00000000
+rk7: dq 0x0000000104d101df
+rk8: dq 0x0000000104c11db7
+rk9: dq 0x6ac7e7d700000000
+rk10: dq 0xfcd922af00000000
+rk11: dq 0x34e45a6300000000
+rk12: dq 0x8762c1f600000000
+rk13: dq 0x5395a0ea00000000
+rk14: dq 0x54f2d5c700000000
+rk15: dq 0xd3504ec700000000
+rk16: dq 0x57a8445500000000
+rk17: dq 0xc053585d00000000
+rk18: dq 0x766f1b7800000000
+rk19: dq 0xcd8c54b500000000
+rk20: dq 0xab40b71e00000000
+
+rk_1b: dq 0xf200aa6600000000
+rk_2b: dq 0x17d3315d00000000
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%else
+INCLUDE_CONSTS
+%endif
+
+mask1: dq 0x8080808080808080, 0x8080808080808080
+mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+
+SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FUNCTION_NAME
+no_ %+ FUNCTION_NAME %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/isa-l/crc/crc32_ieee_by4.asm b/src/isa-l/crc/crc32_ieee_by4.asm
new file mode 100644
index 000000000..f43264095
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_by4.asm
@@ -0,0 +1,566 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Function API:
+; UINT32 crc32_ieee_by4(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://download.intel.com/design/intarch/papers/323102.pdf
+;
+
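+; Editorial note: this is the smaller-footprint variant; it keeps four 128-bit lanes
+; and folds 64 bytes per iteration (_fold_64_B_loop), whereas crc32_ieee_01/02 keep
+; eight lanes and fold 128 bytes per iteration.
+;
+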
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+
+ %xdefine arg1_low32 ecx
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 edi
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*4+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global crc32_ieee_by4, function
+crc32_ieee_by4:
+ endbranch
+
+ not arg1_low32
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the callee-saved xmm registers on the stack (xmm6 and xmm7 are callee-saved under the win64 ABI)
+ movdqa [rsp + XMM_SAVE + 16*0],xmm6
+ movdqa [rsp + XMM_SAVE + 16*1],xmm7
+%endif
+
+ ; check if smaller than 128B
+ cmp arg3, 128
+ jl _less_than_128
+
+
+
+ ; load the initial crc value
+ movd xmm6, arg1_low32 ; initial crc
+	; the crc value does not need to be byte-reflected, but it needs to be
+	; moved to the high part of the register,
+	; because the data will be byte-reflected and will then align with the
+	; initial crc in the correct place.
+ pslldq xmm6, 12
+
+
+
+ movdqa xmm7, [SHUF_MASK]
+ ; receive the initial 64B data, xor the initial crc value
+ movdqu xmm0, [arg2]
+ movdqu xmm1, [arg2+16]
+ movdqu xmm2, [arg2+32]
+ movdqu xmm3, [arg2+48]
+
+
+
+ pshufb xmm0, xmm7
+ ; XOR the initial_crc value
+ pxor xmm0, xmm6
+ pshufb xmm1, xmm7
+ pshufb xmm2, xmm7
+ pshufb xmm3, xmm7
+
+ movdqa xmm6, [rk3] ; k3=2^480 mod POLY << 32
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;we subtract 128 instead of 64 to save one instruction from the loop
+ sub arg3, 128
+
+ ; at this section of the code, there is 64*x+y (0<=y<64) bytes of
+ ; buffer. The _fold_64_B_loop loop will fold 64B at a time until we
+ ; have 64+y Bytes of buffer
+
+
+ ; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
+_fold_64_B_loop:
+
+ ;update the buffer pointer
+ add arg2, 64
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+
+ pclmulqdq xmm0, xmm6 , 0x11
+ pclmulqdq xmm1, xmm6 , 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pclmulqdq xmm5, xmm6, 0x0
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ movdqu xmm4, [arg2]
+ movdqu xmm5, [arg2+16]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+
+ movdqu xmm4, [arg2+32]
+ movdqu xmm5, [arg2+48]
+ pshufb xmm4, xmm7
+ pshufb xmm5, xmm7
+
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+
+ sub arg3, 64
+
+ ; check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ add arg2, 64
+ ;at this point, the arg2 is pointing at the last y Bytes of the buffer
+ ; the 64B of data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
+
+
+ movdqa xmm6, [rk1] ;k1
+
+ ; fold the 4 xmm registers to 1 xmm register with different constants
+ movdqa xmm4, xmm0
+ pclmulqdq xmm0, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm1, xmm4
+ xorps xmm1, xmm0
+
+ movdqa xmm4, xmm1
+ pclmulqdq xmm1, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm2, xmm4
+ xorps xmm2, xmm1
+
+ movdqa xmm4, xmm2
+ pclmulqdq xmm2, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+
+ ;instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 64-16
+ jl _final_reduction_for_128
+
+; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm3 and the rest is in memory
+; we can fold 16 bytes at a time if y>=16
+; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm7
+ pxor xmm3, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm3 register
+
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+ ; since we know that there was data before the pointer, we can offset
+ ; the input pointer before the actual point, to receive exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm3
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm7
+
+ shl arg3, 4
+ lea rax, [pshufb_shf_table + 15*16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ pshufb xmm2, xmm0
+
+ pxor xmm0, [mask3]
+
+ pshufb xmm3, xmm0
+
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ movdqa xmm2, xmm1
+
+ movdqa xmm4, xmm3
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pclmulqdq xmm4, xmm6, 0x0
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+_128_done:
+
+ movdqa xmm6, [rk5]
+ movdqa xmm0, xmm3
+
+ ;64b fold
+ pclmulqdq xmm3, xmm6, 0x1
+ pslldq xmm0, 8
+ pxor xmm3, xmm0
+
+ ;32b fold
+ movdqa xmm0, xmm3
+
+ pand xmm0, [mask4]
+
+ psrldq xmm3, 12
+ pclmulqdq xmm3, xmm6, 0x10
+ pxor xmm3, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm6, [rk7]
+ movdqa xmm0, xmm3
+ pclmulqdq xmm3, xmm6, 0x01
+ pslldq xmm3, 4
+ pclmulqdq xmm3, xmm6, 0x11
+
+ pslldq xmm3, 4
+ pxor xmm3, xmm0
+ pextrd eax, xmm3,1
+
+_cleanup:
+ not eax
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+%endif
+ add rsp,VARIABLE_OFFSET
+
+
+ ret
+
+
+
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_128:
+
+ ;check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm7, [SHUF_MASK]
+
+ ;if there is, load the constants
+ movdqa xmm6, [rk1] ;k1
+
+ movd xmm0, arg1_low32
+ pslldq xmm0, 12
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+
+ ;update the buffer pointer
+ add arg2, 16
+
+ ;update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+
+align 16
+_less_than_32:
+ mov eax, arg1_low32
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm7, [SHUF_MASK]
+
+ movd xmm0, arg1_low32
+ pslldq xmm0, 12
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+ movd xmm0, arg1_low32
+ pslldq xmm0, 12
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm6, [rk1] ;k1
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+	; use stack space to load data of less than 16 bytes; zero out the 16B on the stack first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+
+
+ cmp arg3, 4
+ jl _only_less_than_4
+
+ mov r9, arg3
+
+
+ cmp arg3, 8
+ jl _less_than_8_left
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ shl r9, 4
+ lea rax, [pshufb_shf_table + 15*16]
+ sub rax, r9
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask3]
+
+ pshufb xmm3, xmm0
+ jmp _128_done
+
+align 16
+_exact_16_left:
+ movdqu xmm3, [arg2]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp arg3, 3
+ jl _only_less_than_3
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ psrldq xmm3, 5
+
+ jmp _barrett
+_only_less_than_3:
+ cmp arg3, 2
+ jl _only_less_than_2
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ psrldq xmm3, 6
+
+ jmp _barrett
+_only_less_than_2:
+ mov al, [arg2]
+ mov [r11], al
+
+ movdqa xmm3, [rsp]
+ pshufb xmm3, xmm7
+ pxor xmm3, xmm0
+
+ psrldq xmm3, 7
+
+ jmp _barrett
+; precomputed constants
+section .data
+
+align 16
+rk1:
+DQ 0xf200aa6600000000
+rk2:
+DQ 0x17d3315d00000000
+rk3:
+DQ 0xd3504ec700000000
+rk4:
+DQ 0x57a8445500000000
+rk5:
+DQ 0xf200aa6600000000
+rk6:
+DQ 0x490d678d00000000
+rk7:
+DQ 0x0000000104d101df
+rk8:
+DQ 0x0000000104c11db7
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3:
+dq 0x8080808080808080, 0x8080808080808080
+mask4:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+ align 32
+pshufb_shf_table:
+
+ dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+
+	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+
+	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+
+ dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+
+ dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+
+ dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+
+ dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+
+ dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+
+ dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+
+ dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+
+ dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+
+ dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+
+ dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+
+ dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+
+ dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+
+
+SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+;;; func core, ver, snum
+slversion crc32_ieee_by4, 05, 02, 0017
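The routine above reduces the message with pclmulqdq folds and a final Barrett reduction over the IEEE 802.3 polynomial (0x04C11DB7, visible in the rk8 constant). As a rough cross-check of that machinery, the same CRC can be computed bit-at-a-time in a few lines of C; this is only a sketch, and the ~seed/~crc inversion below is an assumption about the seed convention rather than something taken from this file.

/* Bit-at-a-time CRC32 over the IEEE polynomial, MSB-first (normal) form. */
#include <stdint.h>
#include <stddef.h>

static uint32_t crc32_ieee_bitwise(uint32_t seed, const unsigned char *buf, size_t len)
{
	uint32_t crc = ~seed;	/* assumed convention, cf. the final "not eax" above */

	for (size_t i = 0; i < len; i++) {
		crc ^= (uint32_t)buf[i] << 24;
		for (int b = 0; b < 8; b++)
			crc = (crc & 0x80000000u) ? (crc << 1) ^ 0x04C11DB7u
						  : (crc << 1);
	}
	return ~crc;
}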
diff --git a/src/isa-l/crc/crc32_ieee_perf.c b/src/isa-l/crc/crc32_ieee_perf.c
new file mode 100644
index 000000000..f6ffbbe44
--- /dev/null
+++ b/src/isa-l/crc/crc32_ieee_perf.c
@@ -0,0 +1,79 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+int main(int argc, char *argv[])
+{
+ void *buf;
+ uint32_t crc;
+ struct perf start;
+
+ printf("crc32_ieee_perf:\n");
+
+ if (posix_memalign(&buf, 1024, TEST_LEN)) {
+		printf("alloc error: Fail\n");
+ return -1;
+ }
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ memset(buf, 0, TEST_LEN);
+ BENCHMARK(&start, BENCHMARK_TIME, crc = crc32_ieee(TEST_SEED, buf, TEST_LEN));
+ printf("crc32_ieee" TEST_TYPE_STR ": ");
+ perf_print(start, (long long)TEST_LEN);
+
+ printf("finish 0x%x\n", crc);
+ return 0;
+}
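The benchmark above exercises crc32_ieee() through the BENCHMARK macro from test.h; stripped of the timing harness it reduces to a single call. The sketch below shows that call shape, assuming crc.h declares crc32_ieee(seed, buf, len) exactly as the perf test uses it; it prints whatever the library returns rather than asserting a particular check value.

#include <stdio.h>
#include <stdint.h>
#include "crc.h"

int main(void)
{
	const unsigned char msg[] = "123456789";

	/* same argument order as the BENCHMARK call above: seed, buffer, length */
	uint32_t crc = crc32_ieee(0, msg, sizeof(msg) - 1);

	printf("crc32_ieee(\"123456789\") = 0x%08x\n", crc);
	return 0;
}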
diff --git a/src/isa-l/crc/crc32_iscsi_00.asm b/src/isa-l/crc/crc32_iscsi_00.asm
new file mode 100644
index 000000000..1a5e02928
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_00.asm
@@ -0,0 +1,672 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function to compute iscsi CRC32 with table-based recombination
+; crc done "by 3" with block sizes 1920, 960, 480, 240
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+default rel
+; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks
+%macro crcB3 3
+%define %%bSize %1 ; 1/3 of buffer size
+%define %%td2 %2 ; table offset for crc0 (2/3 of buffer)
+%define %%td1 %3 ; table offset for crc1 (1/3 of buffer)
+
+%IF %%bSize=640
+ sub len, %%bSize*3
+ js %%crcB3_end ;; jump to next level if 3*blockSize > len
+%ELSE
+ cmp len, %%bSize*3
+ jnae %%crcB3_end ;; jump to next level if 3*blockSize > len
+%ENDIF
+ ;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;;
+%%crcB3_loop:
+ ;; rax = crc0 = initial crc
+ xor rbx, rbx ;; rbx = crc1 = 0;
+ xor r10, r10 ;; r10 = crc2 = 0;
+
+ cmp len, %%bSize*3*2
+ jbe %%non_prefetch
+
+ %assign i 0
+ %rep %%bSize/8 - 1
+ %if i < %%bSize*3/4
+ prefetchnta [bufptmp+ %%bSize*3 +i*4]
+ %endif
+ crc32 rax, qword [bufptmp+i + 0*%%bSize] ;; update crc0
+ crc32 rbx, qword [bufptmp+i + 1*%%bSize] ;; update crc1
+ crc32 r10, qword [bufptmp+i + 2*%%bSize] ;; update crc2
+ %assign i (i+8)
+ %endrep
+ jmp %%next %+ %1
+
+%%non_prefetch:
+ %assign i 0
+ %rep %%bSize/8 - 1
+ crc32 rax, qword [bufptmp+i + 0*%%bSize] ;; update crc0
+ crc32 rbx, qword [bufptmp+i + 1*%%bSize] ;; update crc1
+ crc32 r10, qword [bufptmp+i + 2*%%bSize] ;; update crc2
+ %assign i (i+8)
+ %endrep
+
+%%next %+ %1:
+ crc32 rax, qword [bufptmp+i + 0*%%bSize] ;; update crc0
+ crc32 rbx, qword [bufptmp+i + 1*%%bSize] ;; update crc1
+; SKIP ;crc32 r10, [bufptmp+i + 2*%%bSize] ;; update crc2
+
+ ; merge in crc0
+ movzx bufp_dw, al
+ mov r9d, [crc_init + bufp*4 + %%td2]
+ movzx bufp_dw, ah
+ shr eax, 16
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ shl r11, 8
+ xor r9, r11
+
+ movzx bufp_dw, al
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ movzx bufp_dw, ah
+ shl r11, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ shl r11, 24
+ xor r9, r11
+
+ ; merge in crc1
+
+ movzx bufp_dw, bl
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ movzx bufp_dw, bh
+ shr ebx, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ shl r11, 8
+ xor r9, r11
+
+ movzx bufp_dw, bl
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ movzx bufp_dw, bh
+ shl r11, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ shl r11, 24
+ xor r9, r11
+
+ xor r9, [bufptmp+i + 2*%%bSize]
+ crc32 r10, r9
+ mov rax, r10
+
+ add bufptmp, %%bSize*3 ;; move to next block
+ sub len, %%bSize*3
+%IF %%bSize=640
+ jns %%crcB3_loop
+%ENDIF
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%crcB3_end:
+%IF %%bSize=640
+ add len, %%bSize*3
+%ENDIF
+ je do_return ;; return if remaining data is zero
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ISCSI CRC 32 Implementation with crc32 Instruction
+
+;;; unsigned int crc32_iscsi_00(unsigned char * buffer, int len, unsigned int crc_init);
+;;;
+;;; *buf = rcx
+;;; len = rdx
+;;; crc_init = r8
+;;;
+
+mk_global crc32_iscsi_00, function
+crc32_iscsi_00:
+ endbranch
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define bufp rdi
+%define bufp_dw edi
+%define bufp_w di
+%define bufp_b dil
+%define bufptmp rcx
+%define block_0 rcx
+%define block_1 r8
+%define block_2 r11
+%define len rsi
+%define len_dw esi
+%define len_w si
+%define len_b sil
+%define crc_init rdx
+%define crc_init_dw edx
+%else
+%define bufp rcx
+%define bufp_dw ecx
+%define bufp_w cx
+%define bufp_b cl
+%define bufptmp rdi
+%define block_0 rdi
+%define block_1 rsi
+%define block_2 r11
+%define len rdx
+%define len_dw edx
+%define len_w dx
+%define len_b dl
+%define crc_init r8
+%define crc_init_dw r8d
+%endif
+
+
+ push rdi
+ push rbx
+
+ mov rax, crc_init ;; rax = crc_init;
+
+ cmp len, 8
+ jb less_than_8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov bufptmp, bufp ;; rdi = *buf
+ neg bufp
+ and bufp, 7 ;; calculate the unalignment
+ ;; amount of the address
+ je proc_block ;; Skip if aligned
+
+ ;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;;
+ mov rbx, [bufptmp] ;; load a quadword from the buffer
+ add bufptmp, bufp ;; align buffer pointer for
+ ;; quadword processing
+ sub len, bufp ;; update buffer length
+align_loop:
+ crc32 eax, bl ;; compute crc32 of 1-byte
+ shr rbx, 8 ;; get next byte
+ dec bufp
+ jne align_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+proc_block:
+ cmp len, 240
+ jb bit8
+
+ lea crc_init, [mul_table_72] ;; load table base address
+
+	crcB3 640, 0x1000, 0x0c00	; 640*3 = 1920	(mul_table_1272, mul_table_632)
+	crcB3 320, 0x0c00, 0x0800	; 320*3 = 960	(mul_table_632, mul_table_312)
+	crcB3 160, 0x0800, 0x0400	; 160*3 = 480	(mul_table_312, mul_table_152)
+	crcB3 80, 0x0400, 0x0000	; 80*3 = 240	(mul_table_152, mul_table_72)
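+	;; each mul_table_* below is 256 dwords (0x400 bytes), so the offsets above
+	;; select mul_table_(2*bSize-8) for crc0 and mul_table_(bSize-8) for crc1:
+	;; each partial CRC is advanced over the data of the block(s) that follow it,
+	;; minus the final qword of block 2 that crcB3 folds in after the merge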
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256 BYTES REMAIN AT THIS POINT (only the low 8 bits of len are needed)
+
+bit8:
+ shl len_b, 1 ;; shift-out MSB (bit-7)
+ jnc bit7 ;; jump to bit-6 if bit-7 == 0
+ %assign i 0
+ %rep 16
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+	add	bufptmp, 128		;; buf +=128; (next 128 bytes)
+
+bit7:
+ shl len_b, 1 ;; shift-out MSB (bit-7)
+ jnc bit6 ;; jump to bit-6 if bit-7 == 0
+ %assign i 0
+ %rep 8
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 64 ;; buf +=64; (next 64 bytes)
+bit6:
+ shl len_b, 1 ;; shift-out MSB (bit-6)
+ jnc bit5 ;; jump to bit-5 if bit-6 == 0
+ %assign i 0
+ %rep 4
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 32 ;; buf +=32; (next 32 bytes)
+bit5:
+ shl len_b, 1 ;; shift-out MSB (bit-5)
+ jnc bit4 ;; jump to bit-4 if bit-5 == 0
+ %assign i 0
+ %rep 2
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 16 ;; buf +=16; (next 16 bytes)
+bit4:
+ shl len_b, 1 ;; shift-out MSB (bit-4)
+ jnc bit3 ;; jump to bit-3 if bit-4 == 0
+ crc32 rax, qword [bufptmp] ;; compute crc32 of 8-byte data
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 8 ;; buf +=8; (next 8 bytes)
+bit3:
+ mov rbx, qword [bufptmp] ;; load a 8-bytes from the buffer:
+ shl len_b, 1 ;; shift-out MSB (bit-3)
+ jnc bit2 ;; jump to bit-2 if bit-3 == 0
+ crc32 eax, ebx ;; compute crc32 of 4-byte data
+ je do_return ;; return if remaining data is zero
+ shr rbx, 32 ;; get next 3 bytes
+bit2:
+ shl len_b, 1 ;; shift-out MSB (bit-2)
+ jnc bit1 ;; jump to bit-1 if bit-2 == 0
+ crc32 eax, bx ;; compute crc32 of 2-byte data
+ je do_return ;; return if remaining data is zero
+ shr rbx, 16 ;; next byte
+bit1:
+ test len_b,len_b
+ je do_return
+ crc32 eax, bl ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return:
+
+ pop rbx
+ pop rdi
+ ret
+
+less_than_8:
+ test len,4
+ jz less_than_4
+ crc32 eax, dword[bufp]
+ add bufp,4
+less_than_4:
+ test len,2
+ jz less_than_2
+ crc32 eax, word[bufp]
+ add bufp,2
+less_than_2:
+ test len,1
+ jz do_return
+ crc32 rax, byte[bufp]
+ pop rbx
+ pop bufp
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272
+
+section .data
+align 8
+mul_table_72:
+DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba
+DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2
+DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb
+DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3
+DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9
+DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91
+DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788
+DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0
+DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad
+DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5
+DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec
+DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4
+DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de
+DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86
+DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f
+DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7
+DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394
+DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc
+DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5
+DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d
+DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7
+DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf
+DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6
+DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe
+DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183
+DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb
+DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2
+DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a
+DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0
+DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8
+DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1
+DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9
+DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6
+DD 0x68828204,0x51513092,0x1b25e728,0x22f655be
+DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7
+DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff
+DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95
+DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd
+DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4
+DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c
+DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1
+DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9
+DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0
+DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8
+DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82
+DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da
+DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3
+DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b
+DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8
+DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190
+DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989
+DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1
+DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb
+DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3
+DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa
+DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2
+DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df
+DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387
+DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e
+DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6
+DD 0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac
+DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4
+DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed
+DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5
+
+mul_table_152:
+DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118
+DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666
+DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4
+DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a
+DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0
+DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e
+DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c
+DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562
+DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8
+DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96
+DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414
+DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a
+DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710
+DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e
+DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec
+DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92
+DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009
+DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777
+DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5
+DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b
+DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1
+DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f
+DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d
+DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473
+DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9
+DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87
+DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505
+DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b
+DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601
+DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f
+DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd
+DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83
+DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a
+DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444
+DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6
+DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8
+DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2
+DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc
+DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e
+DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740
+DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca
+DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4
+DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636
+DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148
+DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532
+DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c
+DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce
+DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0
+DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b
+DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555
+DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7
+DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9
+DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3
+DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad
+DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f
+DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651
+DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db
+DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5
+DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727
+DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059
+DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423
+DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d
+DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf
+DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1
+
+mul_table_312:
+DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c
+DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972
+DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791
+DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f
+DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57
+DD 0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259
+DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba
+DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4
+DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db
+DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5
+DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736
+DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38
+DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0
+DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe
+DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d
+DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413
+DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032
+DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c
+DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df
+DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1
+DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19
+DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317
+DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4
+DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa
+DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095
+DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b
+DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678
+DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76
+DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe
+DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0
+DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53
+DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d
+DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0
+DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee
+DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d
+DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03
+DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb
+DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5
+DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26
+DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628
+DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347
+DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49
+DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa
+DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4
+DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c
+DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062
+DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81
+DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f
+DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae
+DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0
+DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443
+DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d
+DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985
+DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b
+DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68
+DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766
+DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209
+DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07
+DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4
+DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea
+DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922
+DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c
+DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf
+DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1
+
+mul_table_632:
+DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6
+DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef
+DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655
+DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c
+DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0
+DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9
+DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53
+DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a
+DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b
+DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412
+DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8
+DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291
+DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d
+DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914
+DD 0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae
+DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97
+DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c
+DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115
+DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf
+DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796
+DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a
+DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13
+DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9
+DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90
+DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1
+DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8
+DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352
+DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b
+DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7
+DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee
+DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54
+DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d
+DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3
+DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea
+DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350
+DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69
+DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5
+DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec
+DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56
+DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f
+DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e
+DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117
+DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad
+DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794
+DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428
+DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11
+DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab
+DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92
+DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29
+DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410
+DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa
+DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293
+DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f
+DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916
+DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac
+DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95
+DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4
+DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed
+DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657
+DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e
+DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2
+DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb
+DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51
+DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368
+
+mul_table_1272:
+DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c
+DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3
+DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2
+DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d
+DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31
+DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece
+DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf
+DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530
+DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7
+DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28
+DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529
+DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6
+DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda
+DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25
+DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424
+DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db
+DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b
+DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4
+DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5
+DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a
+DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416
+DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9
+DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8
+DD 0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17
+DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0
+DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f
+DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e
+DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1
+DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd
+DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502
+DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03
+DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc
+DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283
+DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c
+DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d
+DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82
+DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e
+DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671
+DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870
+DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f
+DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668
+DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397
+DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96
+DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869
+DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765
+DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a
+DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b
+DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964
+DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4
+DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b
+DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a
+DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5
+DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9
+DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956
+DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757
+DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8
+DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f
+DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0
+DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1
+DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e
+DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842
+DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd
+DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc
+DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643
+
+;;; func core, ver, snum
+slversion crc32_iscsi_00, 00, 04, 0014
+
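The crcB3 macro above runs three hardware crc32 streams over consecutive thirds of a block and then merges crc0 and crc1 through the mul_table_* lookups. A rough C sketch of just the interleaved part is given below, using the SSE4.2 intrinsic _mm_crc32_u64 (compile with -msse4.2); the merge step is deliberately omitted and the function name is invented for illustration.

#include <stdint.h>
#include <stddef.h>
#include <nmmintrin.h>	/* _mm_crc32_u64 */

/* Advance three independent CRC32-C streams over three consecutive
 * blocks of buf; bsize_qw is the block size in 64-bit words. */
static void crc32c_three_streams(const uint64_t *buf, size_t bsize_qw,
				 uint64_t *crc0, uint64_t *crc1, uint64_t *crc2)
{
	for (size_t i = 0; i < bsize_qw; i++) {
		*crc0 = _mm_crc32_u64(*crc0, buf[i]);
		*crc1 = _mm_crc32_u64(*crc1, buf[i + bsize_qw]);
		*crc2 = _mm_crc32_u64(*crc2, buf[i + 2 * bsize_qw]);
	}
}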
diff --git a/src/isa-l/crc/crc32_iscsi_01.asm b/src/isa-l/crc/crc32_iscsi_01.asm
new file mode 100644
index 000000000..e0f2b5e82
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_01.asm
@@ -0,0 +1,592 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+%include "reg_sizes.asm"
+
+default rel
+%define CONCAT(a,b,c) a %+ b %+ c
+
+; Define threshold where buffers are considered "small" and routed to more
+; efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
+; SMALL_SIZE can be no larger than 256.
+%define SMALL_SIZE 200
+
+%if (SMALL_SIZE > 256)
+%error SMALL_SIZE must be <= 256
+% error ; needed because '%error' actually generates only a warning
+%endif
+
+;;; unsigned int crc32_iscsi_01(unsigned char * buffer, int len, unsigned int crc_init);
+;;;
+;;; *buf = rcx
+;;; len = rdx
+;;; crc_init = r8
+
+mk_global crc32_iscsi_01, function
+crc32_iscsi_01:
+ endbranch
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define bufp rdi
+%define bufp_dw edi
+%define bufp_w di
+%define bufp_b dil
+%define bufptmp rcx
+%define block_0 rcx
+%define block_1 rdx
+%define block_2 r11
+%define len rsi
+%define len_dw esi
+%define len_w si
+%define len_b sil
+%define crc_init_arg rdx
+%else
+%define bufp rcx
+%define bufp_dw ecx
+%define bufp_w cx
+%define bufp_b cl
+%define bufptmp rdi
+%define block_0 rdi
+%define block_1 rsi
+%define block_2 r11
+%define len rdx
+%define len_dw edx
+%define len_w dx
+%define len_b dl
+%endif
+
+%define tmp rbx
+%define crc_init r8
+%define crc_init_dw r8d
+%define crc1 r9
+%define crc2 r10
+
+ push rbx
+ push rdi
+ push rsi
+
+ ;; Move crc_init for Linux to a different reg
+%ifidn __OUTPUT_FORMAT__, elf64
+ mov crc_init, crc_init_arg
+%endif
+
+ ;; If len is less than 8 we need to jump to special code to avoid
+ ;; reading beyond the end of the buffer
+ cmp len, 8
+ jb less_than_8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov bufptmp, bufp ;; rdi = *buf
+ neg bufp
+ and bufp, 7 ;; calculate the unalignment amount of
+ ;; the address
+ je proc_block ;; Skip if aligned
+
+ ;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;
+ mov tmp, [bufptmp] ;; load a quadword from the buffer
+ add bufptmp, bufp ;; align buffer pointer for quadword
+ ;; processing
+ sub len, bufp ;; update buffer length
+align_loop:
+ crc32 crc_init_dw, bl ;; compute crc32 of 1-byte
+ shr tmp, 8 ;; get next byte
+ dec bufp
+ jne align_loop
+
+proc_block:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2) PROCESS BLOCKS: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; compute num of bytes to be processed
+ mov tmp, len ;; save num bytes in tmp
+
+ cmp len, 128*24
+ jae full_block
+
+continue_block:
+ cmp len, SMALL_SIZE
+ jb small
+
+ ;; len < 128*24
+ mov rax, 2731 ;; 2731 = ceil(2^16 / 24)
+ mul len_dw
+ shr rax, 16
+
+ ;; eax contains floor(bytes / 24) = num 24-byte chunks to do
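+	;; e.g. len = 3071 (the largest value possible here): 3071*2731 = 8386901
+	;; and 8386901 >> 16 = 127 = floor(3071/24), so over the whole 0..3071
+	;; range the fixed-point multiply matches an exact division by 24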
+
+ ;; process rax 24-byte chunks (128 >= rax >= 0)
+
+ ;; compute end address of each block
+ ;; rdi -> block 0 (base addr + RAX * 8)
+ ;; rsi -> block 1 (base addr + RAX * 16)
+ ;; r11 -> block 2 (base addr + RAX * 24)
+ lea block_0, [bufptmp + rax * 8]
+ lea block_1, [block_0 + rax * 8]
+ lea block_2, [block_1 + rax * 8]
+
+ xor crc1,crc1
+ xor crc2,crc2
+
+ ;; branch into array
+ lea bufp, [jump_table]
+ movzx len, word [bufp + rax * 2] ;; len is offset from crc_array
+ lea bufp, [bufp + len + crc_array - jump_table]
+ jmp bufp
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2a) PROCESS FULL BLOCKS: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+full_block:
+ mov rax, 128
+ lea block_1, [block_0 + 128*8*2]
+ lea block_2, [block_0 + 128*8*3]
+ add block_0, 128*8*1
+
+ xor crc1,crc1
+ xor crc2,crc2
+
+; ;; branch into array
+; jmp CONCAT(crc_,128,)
+	; Fall through into top of crc array (crc_128)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3) CRC Array: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+crc_array:
+ cmp len, 128*24*2
+ jbe non_prefetch
+
+%assign i 128
+%rep 128-1
+
+CONCAT(_crc_,i,:)
+ crc32 crc_init, qword [block_0 - i*8]
+ crc32 crc1, qword [block_1 - i*8]
+ crc32 crc2, qword [block_2 - i*8]
+
+ %if i > 128*8 / 32 ; prefetch next 3KB data
+ prefetchnta [block_2 + 128*32 - i*32]
+ %endif
+
+%assign i (i-1)
+%endrep
+ jmp next_
+
+non_prefetch:
+%assign i 128
+%rep 128-1
+
+CONCAT(crc_,i,:)
+ endbranch
+ crc32 crc_init, qword [block_0 - i*8]
+ crc32 crc1, qword [block_1 - i*8]
+ crc32 crc2, qword [block_2 - i*8]
+%assign i (i-1)
+%endrep
+
+next_:
+CONCAT(crc_,i,:)
+ crc32 crc_init, qword [block_0 - i*8]
+ crc32 crc1, qword [block_1 - i*8]
+; SKIP ;crc32 crc2, [block_2 - i*8] ; Don't do this one yet
+
+ mov block_0, block_2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 4) Combine three results: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
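+	; crc_init and crc1 each cover only their own block, so before they can be
+	; combined with crc2 they must be advanced over the data that followed them;
+	; the K_table constants are the precomputed x^k mod P factors that let a
+	; single pclmulqdq per CRC perform that advance, and the final crc32 on the
+	; combined qword completes the reduction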
+ lea bufp, [K_table - 16] ; first entry is for idx 1
+ shl rax, 3 ; rax *= 8
+ sub tmp, rax ; tmp -= rax*8
+ shl rax, 1
+ sub tmp, rax ; tmp -= rax*16 (total tmp -= rax*24)
+ add bufp, rax
+
+ movdqa xmm0, [bufp] ; 2 consts: K1:K2
+
+ movq xmm1, crc_init ; CRC for block 1
+ pclmulqdq xmm1, xmm0, 0x00 ; Multiply by K2
+
+ movq xmm2, crc1 ; CRC for block 2
+ pclmulqdq xmm2, xmm0, 0x10 ; Multiply by K1
+
+ pxor xmm1, xmm2
+ movq rax, xmm1
+ xor rax, [block_2 - i*8]
+ mov crc_init, crc2
+ crc32 crc_init, rax
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 5) Check for end: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+CONCAT(crc_,0,:)
+ mov len, tmp
+ cmp tmp, 128*24
+ jae full_block
+ cmp tmp, 24
+ jae continue_block
+
+fewer_than_24:
+ ;; now fewer than 24 bytes remain
+ cmp tmp, 16
+ jae do_16
+ cmp tmp, 8
+ jae do_8
+
+ ;; 0 <= tmp <= 7
+ shl ebx, 29 ; size now in bits 31:29
+ jz do_return
+check_4:
+ mov bufp, [bufptmp]
+ shl ebx, 1 ; shift out into carry MSB (orig size & 4)
+ jnc check_2
+ crc32 crc_init_dw, bufp_dw
+ jz do_return
+ shr bufp, 32 ; shift data down by 4 bytes
+check_2:
+ shl ebx, 1 ; shift out into carry MSB (orig size & 2)
+ jnc check_1
+ crc32 crc_init_dw, bufp_w
+ jz do_return
+ shr bufp, 16 ; shift data down by 2 bytes
+check_1:
+ crc32 crc_init_dw, bufp_b
+
+do_return:
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+do_8:
+ crc32 crc_init, qword [bufptmp]
+ add bufptmp, 8
+ shl ebx, 29 ; size (0...7) in bits 31:29
+ jnz check_4
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+do_16:
+ crc32 crc_init, qword [bufptmp]
+ crc32 crc_init, qword [bufptmp+8]
+ add bufptmp, 16
+ shl ebx, 29 ; size (0...7) in bits 31:29
+ jnz check_4
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Handle the case of fewer than 8 bytes, unaligned. In this case
+ ;; we can't read 8 bytes, as this might go beyond the end of the buffer
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+less_than_8:
+ test len,4
+ jz less_than_4
+ crc32 crc_init_dw, dword[bufp]
+ add bufp,4
+less_than_4:
+ test len,2
+ jz less_than_2
+ crc32 crc_init_dw, word[bufp]
+ add bufp,2
+less_than_2:
+ test len,1
+ jz do_return
+ crc32 crc_init_dw, byte[bufp]
+ mov rax, crc_init
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256 BYTES REMAIN AT THIS POINT (only the low 8 bits of len are needed)
+
+small:
+ mov rax, crc_init
+
+bit8:
+ shl len_b, 1 ;; shift-out MSB (bit-7)
+ jnc bit7 ;; jump to bit-6 if bit-7 == 0
+ %assign i 0
+ %rep 16
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+	add	bufptmp, 128		;; buf +=128; (next 128 bytes)
+
+bit7:
+ shl len_b, 1 ;; shift-out MSB (bit-7)
+ jnc bit6 ;; jump to bit-6 if bit-7 == 0
+ %assign i 0
+ %rep 8
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 64 ;; buf +=64; (next 64 bytes)
+bit6:
+ shl len_b, 1 ;; shift-out MSB (bit-6)
+ jnc bit5 ;; jump to bit-5 if bit-6 == 0
+ %assign i 0
+ %rep 4
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 32 ;; buf +=32; (next 32 bytes)
+bit5:
+ shl len_b, 1 ;; shift-out MSB (bit-5)
+ jnc bit4 ;; jump to bit-4 if bit-5 == 0
+ %assign i 0
+ %rep 2
+ crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 16 ;; buf +=16; (next 16 bytes)
+bit4:
+ shl len_b, 1 ;; shift-out MSB (bit-4)
+ jnc bit3 ;; jump to bit-3 if bit-4 == 0
+ crc32 rax, qword [bufptmp] ;; compute crc32 of 8-byte data
+ je do_return2 ;; return if remaining data is zero
+ add bufptmp, 8 ;; buf +=8; (next 8 bytes)
+bit3:
+ mov rbx, qword [bufptmp] ;; load a 8-bytes from the buffer:
+ shl len_b, 1 ;; shift-out MSB (bit-3)
+ jnc bit2 ;; jump to bit-2 if bit-3 == 0
+ crc32 eax, ebx ;; compute crc32 of 4-byte data
+ je do_return2 ;; return if remaining data is zero
+ shr rbx, 32 ;; get next 3 bytes
+bit2:
+ shl len_b, 1 ;; shift-out MSB (bit-2)
+ jnc bit1 ;; jump to bit-1 if bit-2 == 0
+ crc32 eax, bx ;; compute crc32 of 2-byte data
+ je do_return2 ;; return if remaining data is zero
+ shr rbx, 16 ;; next byte
+bit1:
+ test len_b,len_b
+ je do_return2
+ crc32 eax, bl ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return2:
+ pop rsi
+ pop rdi
+ pop rbx
+ ret
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; jump table ;; Table is 129 entries x 2 bytes each
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+align 4
+jump_table:
+%assign i 0
+%rep 129
+ dw CONCAT(crc_,i,) - crc_array
+%assign i (i+1)
+%endrep
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; PCLMULQDQ tables
+ ;; Table is 128 entries x 2 quad words each
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .data
+align 64
+K_table:
+ dq 0x14cd00bd6, 0x105ec76f0
+ dq 0x0ba4fc28e, 0x14cd00bd6
+ dq 0x1d82c63da, 0x0f20c0dfe
+ dq 0x09e4addf8, 0x0ba4fc28e
+ dq 0x039d3b296, 0x1384aa63a
+ dq 0x102f9b8a2, 0x1d82c63da
+ dq 0x14237f5e6, 0x01c291d04
+ dq 0x00d3b6092, 0x09e4addf8
+ dq 0x0c96cfdc0, 0x0740eef02
+ dq 0x18266e456, 0x039d3b296
+ dq 0x0daece73e, 0x0083a6eec
+ dq 0x0ab7aff2a, 0x102f9b8a2
+ dq 0x1248ea574, 0x1c1733996
+ dq 0x083348832, 0x14237f5e6
+ dq 0x12c743124, 0x02ad91c30
+ dq 0x0b9e02b86, 0x00d3b6092
+ dq 0x018b33a4e, 0x06992cea2
+ dq 0x1b331e26a, 0x0c96cfdc0
+ dq 0x17d35ba46, 0x07e908048
+ dq 0x1bf2e8b8a, 0x18266e456
+ dq 0x1a3e0968a, 0x11ed1f9d8
+ dq 0x0ce7f39f4, 0x0daece73e
+ dq 0x061d82e56, 0x0f1d0f55e
+ dq 0x0d270f1a2, 0x0ab7aff2a
+ dq 0x1c3f5f66c, 0x0a87ab8a8
+ dq 0x12ed0daac, 0x1248ea574
+ dq 0x065863b64, 0x08462d800
+ dq 0x11eef4f8e, 0x083348832
+ dq 0x1ee54f54c, 0x071d111a8
+ dq 0x0b3e32c28, 0x12c743124
+ dq 0x0064f7f26, 0x0ffd852c6
+ dq 0x0dd7e3b0c, 0x0b9e02b86
+ dq 0x0f285651c, 0x0dcb17aa4
+ dq 0x010746f3c, 0x018b33a4e
+ dq 0x1c24afea4, 0x0f37c5aee
+ dq 0x0271d9844, 0x1b331e26a
+ dq 0x08e766a0c, 0x06051d5a2
+ dq 0x093a5f730, 0x17d35ba46
+ dq 0x06cb08e5c, 0x11d5ca20e
+ dq 0x06b749fb2, 0x1bf2e8b8a
+ dq 0x1167f94f2, 0x021f3d99c
+ dq 0x0cec3662e, 0x1a3e0968a
+ dq 0x19329634a, 0x08f158014
+ dq 0x0e6fc4e6a, 0x0ce7f39f4
+ dq 0x08227bb8a, 0x1a5e82106
+ dq 0x0b0cd4768, 0x061d82e56
+ dq 0x13c2b89c4, 0x188815ab2
+ dq 0x0d7a4825c, 0x0d270f1a2
+ dq 0x10f5ff2ba, 0x105405f3e
+ dq 0x00167d312, 0x1c3f5f66c
+ dq 0x0f6076544, 0x0e9adf796
+ dq 0x026f6a60a, 0x12ed0daac
+ dq 0x1a2adb74e, 0x096638b34
+ dq 0x19d34af3a, 0x065863b64
+ dq 0x049c3cc9c, 0x1e50585a0
+ dq 0x068bce87a, 0x11eef4f8e
+ dq 0x1524fa6c6, 0x19f1c69dc
+ dq 0x16cba8aca, 0x1ee54f54c
+ dq 0x042d98888, 0x12913343e
+ dq 0x1329d9f7e, 0x0b3e32c28
+ dq 0x1b1c69528, 0x088f25a3a
+ dq 0x02178513a, 0x0064f7f26
+ dq 0x0e0ac139e, 0x04e36f0b0
+ dq 0x0170076fa, 0x0dd7e3b0c
+ dq 0x141a1a2e2, 0x0bd6f81f8
+ dq 0x16ad828b4, 0x0f285651c
+ dq 0x041d17b64, 0x19425cbba
+ dq 0x1fae1cc66, 0x010746f3c
+ dq 0x1a75b4b00, 0x18db37e8a
+ dq 0x0f872e54c, 0x1c24afea4
+ dq 0x01e41e9fc, 0x04c144932
+ dq 0x086d8e4d2, 0x0271d9844
+ dq 0x160f7af7a, 0x052148f02
+ dq 0x05bb8f1bc, 0x08e766a0c
+ dq 0x0a90fd27a, 0x0a3c6f37a
+ dq 0x0b3af077a, 0x093a5f730
+ dq 0x04984d782, 0x1d22c238e
+ dq 0x0ca6ef3ac, 0x06cb08e5c
+ dq 0x0234e0b26, 0x063ded06a
+ dq 0x1d88abd4a, 0x06b749fb2
+ dq 0x04597456a, 0x04d56973c
+ dq 0x0e9e28eb4, 0x1167f94f2
+ dq 0x07b3ff57a, 0x19385bf2e
+ dq 0x0c9c8b782, 0x0cec3662e
+ dq 0x13a9cba9e, 0x0e417f38a
+ dq 0x093e106a4, 0x19329634a
+ dq 0x167001a9c, 0x14e727980
+ dq 0x1ddffc5d4, 0x0e6fc4e6a
+ dq 0x00df04680, 0x0d104b8fc
+ dq 0x02342001e, 0x08227bb8a
+ dq 0x00a2a8d7e, 0x05b397730
+ dq 0x168763fa6, 0x0b0cd4768
+ dq 0x1ed5a407a, 0x0e78eb416
+ dq 0x0d2c3ed1a, 0x13c2b89c4
+ dq 0x0995a5724, 0x1641378f0
+ dq 0x19b1afbc4, 0x0d7a4825c
+ dq 0x109ffedc0, 0x08d96551c
+ dq 0x0f2271e60, 0x10f5ff2ba
+ dq 0x00b0bf8ca, 0x00bf80dd2
+ dq 0x123888b7a, 0x00167d312
+ dq 0x1e888f7dc, 0x18dcddd1c
+ dq 0x002ee03b2, 0x0f6076544
+ dq 0x183e8d8fe, 0x06a45d2b2
+ dq 0x133d7a042, 0x026f6a60a
+ dq 0x116b0f50c, 0x1dd3e10e8
+ dq 0x05fabe670, 0x1a2adb74e
+ dq 0x130004488, 0x0de87806c
+ dq 0x000bcf5f6, 0x19d34af3a
+ dq 0x18f0c7078, 0x014338754
+ dq 0x017f27698, 0x049c3cc9c
+ dq 0x058ca5f00, 0x15e3e77ee
+ dq 0x1af900c24, 0x068bce87a
+ dq 0x0b5cfca28, 0x0dd07448e
+ dq 0x0ded288f8, 0x1524fa6c6
+ dq 0x059f229bc, 0x1d8048348
+ dq 0x06d390dec, 0x16cba8aca
+ dq 0x037170390, 0x0a3e3e02c
+ dq 0x06353c1cc, 0x042d98888
+ dq 0x0c4584f5c, 0x0d73c7bea
+ dq 0x1f16a3418, 0x1329d9f7e
+ dq 0x0531377e2, 0x185137662
+ dq 0x1d8d9ca7c, 0x1b1c69528
+ dq 0x0b25b29f2, 0x18a08b5bc
+ dq 0x19fb2a8b0, 0x02178513a
+ dq 0x1a08fe6ac, 0x1da758ae0
+ dq 0x045cddf4e, 0x0e0ac139e
+ dq 0x1a91647f2, 0x169cf9eb0
+ dq 0x1a0f717c4, 0x0170076fa
+
+;;; func core, ver, snum
+slversion crc32_iscsi_01, 01, 04, 0015
+
diff --git a/src/isa-l/crc/crc32_iscsi_by16_10.asm b/src/isa-l/crc/crc32_iscsi_by16_10.asm
new file mode 100644
index 000000000..4c63bab39
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_by16_10.asm
@@ -0,0 +1,556 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; UINT32 crc32_iscsi_by16_10(
+; UINT32 init_crc, //initial CRC value, 32 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; UINT64 len //buffer length in bytes (64-bit data)
+; );
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+;
+;
+
+%include "reg_sizes.asm"
+
+%ifndef FUNCTION_NAME
+%define FUNCTION_NAME crc32_iscsi_by16_10
+%endif
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 r8
+ %xdefine arg2 rcx
+ %xdefine arg3 rdx
+
+ %xdefine arg1_low32 r8d
+%else
+ %xdefine arg1 rdx
+ %xdefine arg2 rdi
+ %xdefine arg3 rsi
+
+ %xdefine arg1_low32 edx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*12+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global FUNCTION_NAME, function
+FUNCTION_NAME:
+ endbranch
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the callee-saved xmm registers (xmm6-xmm15) on the stack, per the win64 ABI
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+ jl .less_than_256
+
+ ; load the initial crc value
+ vmovd xmm10, arg1_low32 ; initial crc
+
+	; load the initial 128B of data and xor in the initial crc value
+ vmovdqu8 zmm0, [arg2+16*0]
+ vmovdqu8 zmm4, [arg2+16*4]
+ vpxorq zmm0, zmm10
+ vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+
+ sub arg3, 256
+ cmp arg3, 256
+ jl .fold_128_B_loop
+
+ vmovdqu8 zmm7, [arg2+16*8]
+ vmovdqu8 zmm8, [arg2+16*12]
+ vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
+ sub arg3, 256
+
+.fold_256_B_loop:
+ add arg2, 256
+ vmovdqu8 zmm3, [arg2+16*0]
+ vpclmulqdq zmm1, zmm0, zmm16, 0x10
+ vpclmulqdq zmm2, zmm0, zmm16, 0x01
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm3
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpclmulqdq zmm5, zmm4, zmm16, 0x10
+ vpclmulqdq zmm6, zmm4, zmm16, 0x01
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ vmovdqu8 zmm11, [arg2+16*8]
+ vpclmulqdq zmm12, zmm7, zmm16, 0x10
+ vpclmulqdq zmm13, zmm7, zmm16, 0x01
+ vpxorq zmm7, zmm12, zmm13
+ vpxorq zmm7, zmm7, zmm11
+
+ vmovdqu8 zmm17, [arg2+16*12]
+ vpclmulqdq zmm14, zmm8, zmm16, 0x10
+ vpclmulqdq zmm15, zmm8, zmm16, 0x01
+ vpxorq zmm8, zmm14, zmm15
+ vpxorq zmm8, zmm8, zmm17
+
+ sub arg3, 256
+ jge .fold_256_B_loop
+
+ ;; Fold 256 into 128
+ add arg2, 256
+ vpclmulqdq zmm1, zmm0, zmm10, 0x01
+ vpclmulqdq zmm2, zmm0, zmm10, 0x10
+ vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
+
+ vpclmulqdq zmm5, zmm4, zmm10, 0x01
+ vpclmulqdq zmm6, zmm4, zmm10, 0x10
+ vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
+
+ vmovdqa32 zmm0, zmm7
+ vmovdqa32 zmm4, zmm8
+
+ add arg3, 128
+ jmp .fold_128_B_register
+
+
+
+	; at this point there are 128*x+y (0 <= y < 128) bytes of buffer; the fold_128_B_loop
+	; below folds 128B at a time until only 128+y bytes of buffer remain
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+.fold_128_B_loop:
+ add arg2, 128
+ vmovdqu8 zmm8, [arg2+16*0]
+ vpclmulqdq zmm2, zmm0, zmm10, 0x10
+ vpclmulqdq zmm1, zmm0, zmm10, 0x01
+ vpxorq zmm0, zmm2, zmm1
+ vpxorq zmm0, zmm0, zmm8
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpclmulqdq zmm5, zmm4, zmm10, 0x10
+ vpclmulqdq zmm6, zmm4, zmm10, 0x01
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ sub arg3, 128
+ jge .fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+	; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
+	; the 128B of folded data is held in the two accumulators zmm0 and zmm4 (8 x 128-bit lanes)
+
+.fold_128_B_register:
+ ; fold the 8 128b parts into 1 xmm register with different constants
+ vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
+ vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
+ vpclmulqdq zmm1, zmm0, zmm16, 0x01
+ vpclmulqdq zmm2, zmm0, zmm16, 0x10
+	vextracti64x2 xmm7, zmm4, 3		; save the last 128-bit lane, which has no fold constant and is xored in directly
+
+ vpclmulqdq zmm5, zmm4, zmm11, 0x01
+ vpclmulqdq zmm6, zmm4, zmm11, 0x10
+ vmovdqa xmm10, [rk1] ; Needed later in reduction loop
+ vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
+ vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
+
+ vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
+ vpxorq ymm8, ymm8, ymm1
+ vextracti64x2 xmm5, ymm8, 1
+ vpxorq xmm7, xmm5, xmm8
+
+	; we add 128-16 (rather than 128) to the loop counter so one instruction can be dropped from the loop below;
+	; the sign flag set by the add replaces a separate cmp before the jl instruction
+ add arg3, 128-16
+ jl .final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+.16B_reduction_loop:
+ vpclmulqdq xmm8, xmm7, xmm10, 0x1
+ vpclmulqdq xmm7, xmm7, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge .16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+.final_reduction_for_128:
+ add arg3, 16
+ je .128_done
+
+	; fewer than 16 bytes remain. Since data was already consumed before the
+	; current pointer, back the pointer up so that exactly 16 bytes, ending at
+	; the true end of the buffer, can be loaded; the registers are then
+	; adjusted to discard the bytes that were loaded twice.
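+	; Rough sketch of the trick (illustrative only): load the 16B that end exactly
+	; at the end of the buffer (overlapping data already folded), fetch a byte-shift
+	; mask from pshufb_shf_table indexed by the remaining length, shift the current
+	; remainder with it, blend it with the freshly loaded tail, and finish with one
+	; normal 16B fold.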
+.get_last_two_xmms:
+
+ vmovdqa xmm2, xmm7
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ vmovdqu xmm0, [rax]
+
+ vpshufb xmm7, xmm0
+ vpxor xmm0, [mask3]
+ vpshufb xmm2, xmm0
+
+ vpblendvb xmm2, xmm2, xmm1, xmm0
+ ;;;;;;;;;;
+ vpclmulqdq xmm8, xmm7, xmm10, 0x1
+ vpclmulqdq xmm7, xmm7, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+.128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5]
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0
+ vpsrldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;32b fold
+ vmovdqa xmm0, xmm7
+ vpslldq xmm7, 4
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm0
+
+
+ ;barrett reduction
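+	; In outline (a hedged sketch; the exact shifts and bit order follow the
+	; reflected form used here): the value left in xmm7 is reduced with two
+	; carry-less multiplies against the pair loaded from [rk7]: the first qword
+	; (the Barrett constant, mu ~ x^64/P) produces an approximate quotient q, and
+	; q * P (the polynomial in the second qword, rk8) is xored back so that only
+	; the 32-bit remainder, i.e. the CRC, survives.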
+.barrett:
+ vpand xmm7, [mask2]
+ vmovdqa xmm1, xmm7
+ vmovdqa xmm2, xmm7
+ vmovdqa xmm10, [rk7]
+
+ vpclmulqdq xmm7, xmm10, 0
+ vpxor xmm7, xmm2
+ vpand xmm7, [mask]
+ vmovdqa xmm2, xmm7
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpxor xmm7, xmm2
+ vpxor xmm7, xmm1
+ vpextrd eax, xmm7, 2
+
+.cleanup:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+.less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl .less_than_32
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp .16B_reduction_loop
+
+
+align 16
+.less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov eax, arg1_low32
+ test arg3, arg3
+ je .cleanup
+
+ vmovd xmm0, arg1_low32 ; get the initial crc value
+
+ cmp arg3, 16
+ je .exact_16_left
+ jl .less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp .get_last_two_xmms
+
+align 16
+.less_than_16_left:
+	; for inputs shorter than 16 bytes, stage the data through the stack: zero a 16B slot first, then copy the remaining bytes into it.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ cmp arg3, 4
+ jl .only_less_than_4
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl .less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+.less_than_8_left:
+
+ cmp arg3, 4
+ jl .less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+.less_than_4_left:
+
+ cmp arg3, 2
+ jl .less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+.less_than_2_left:
+ cmp arg3, 1
+ jl .zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+.zero_left:
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+ vmovdqu xmm0, [rax + r9]
+ vpshufb xmm7,xmm0
+ jmp .128_done
+
+align 16
+.exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ jmp .128_done
+
+.only_less_than_4:
+ cmp arg3, 3
+ jl .only_less_than_3
+
+ ; load 3 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ mov al, [arg2+2]
+ mov [r11+2], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 5
+ jmp .barrett
+
+.only_less_than_3:
+ cmp arg3, 2
+ jl .only_less_than_2
+
+ ; load 2 Bytes
+ mov al, [arg2]
+ mov [r11], al
+
+ mov al, [arg2+1]
+ mov [r11+1], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 6
+ jmp .barrett
+
+.only_less_than_2:
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ vpslldq xmm7, 7
+ jmp .barrett
+
+section .data
+align 32
+
+%ifndef USE_CONSTS
+; precomputed constants
+rk_1: dq 0x00000000b9e02b86
+rk_2: dq 0x00000000dcb17aa4
+rk1: dq 0x00000000493c7d27
+rk2: dq 0x0000000ec1068c50
+rk3: dq 0x0000000206e38d70
+rk4: dq 0x000000006992cea2
+rk5: dq 0x00000000493c7d27
+rk6: dq 0x00000000dd45aab8
+rk7: dq 0x00000000dea713f0
+rk8: dq 0x0000000105ec76f0
+rk9: dq 0x0000000047db8317
+rk10: dq 0x000000002ad91c30
+rk11: dq 0x000000000715ce53
+rk12: dq 0x00000000c49f4f67
+rk13: dq 0x0000000039d3b296
+rk14: dq 0x00000000083a6eec
+rk15: dq 0x000000009e4addf8
+rk16: dq 0x00000000740eef02
+rk17: dq 0x00000000ddc0152b
+rk18: dq 0x000000001c291d04
+rk19: dq 0x00000000ba4fc28e
+rk20: dq 0x000000003da6d0cb
+
+rk_1b: dq 0x00000000493c7d27
+rk_2b: dq 0x0000000ec1068c50
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+
+%else
+INCLUDE_CONSTS
+%endif
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3: dq 0x8080808080808080, 0x8080808080808080
+
+%else  ; Assembler doesn't understand these opcodes; emit an empty symbol so win64 builds still link.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FUNCTION_NAME
+no_ %+ FUNCTION_NAME %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/isa-l/crc/crc32_iscsi_perf.c b/src/isa-l/crc/crc32_iscsi_perf.c
new file mode 100644
index 000000000..d768cdfa6
--- /dev/null
+++ b/src/isa-l/crc/crc32_iscsi_perf.c
@@ -0,0 +1,79 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_TYPE_STR "_cold"
+#endif
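+// Note: the cached variant can also be selected without editing this file by
+// defining the macro at build time, e.g. (hypothetical invocation)
+//   cc -DCACHED_TEST -o crc32_iscsi_perf crc32_iscsi_perf.c ...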
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+int main(int argc, char *argv[])
+{
+ void *buf;
+ uint32_t crc;
+ struct perf start;
+
+ printf("crc32_iscsi_perf:\n");
+
+ if (posix_memalign(&buf, 1024, TEST_LEN)) {
+		printf("alloc error: Fail\n");
+ return -1;
+ }
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ memset(buf, 0, TEST_LEN);
+ BENCHMARK(&start, BENCHMARK_TIME, crc = crc32_iscsi(buf, TEST_LEN, TEST_SEED));
+ printf("crc32_iscsi" TEST_TYPE_STR ": ");
+ perf_print(start, (long long)TEST_LEN);
+
+ printf("finish 0x%x\n", crc);
+ return 0;
+}
diff --git a/src/isa-l/crc/crc64_base.c b/src/isa-l/crc/crc64_base.c
new file mode 100644
index 000000000..7cf5a69cf
--- /dev/null
+++ b/src/isa-l/crc/crc64_base.c
@@ -0,0 +1,912 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "crc64.h"
+
+static const uint64_t crc64_ecma_refl_table[256] = {
+ 0x0000000000000000ULL, 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL, 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL, 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL, 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL, 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL, 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL, 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL, 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL, 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL, 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL, 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL, 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL, 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL, 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL, 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL, 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL, 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL, 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL, 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL, 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL, 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL, 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL, 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL, 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL, 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL, 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL, 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL, 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL, 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL, 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL, 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL, 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL, 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL, 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL, 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL, 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL, 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL, 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL, 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL, 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL, 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL, 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL, 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL, 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL, 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL, 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL, 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL, 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL, 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL, 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL, 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL, 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL, 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL, 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL, 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL, 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL, 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL, 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL, 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL, 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL, 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL, 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL, 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL, 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL, 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL, 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL, 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL, 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL, 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL, 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL, 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL, 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL, 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL, 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL, 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL, 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL, 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL, 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL, 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL, 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL, 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL, 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL, 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL, 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL, 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL, 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL, 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL, 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL, 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL, 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL, 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL, 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL, 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL, 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL, 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL, 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL, 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL, 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL, 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL, 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL, 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL, 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL, 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL, 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL, 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL, 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL, 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL, 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL, 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL, 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL, 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL, 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL, 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL, 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL, 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL, 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL, 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL, 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL, 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL, 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL, 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL, 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL, 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL, 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL, 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL, 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL, 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL, 0xe0ada17364673f59ULL
+};
+
+static const uint64_t crc64_ecma_norm_table[256] = {
+ 0x0000000000000000ULL, 0x42f0e1eba9ea3693ULL,
+ 0x85e1c3d753d46d26ULL, 0xc711223cfa3e5bb5ULL,
+ 0x493366450e42ecdfULL, 0x0bc387aea7a8da4cULL,
+ 0xccd2a5925d9681f9ULL, 0x8e224479f47cb76aULL,
+ 0x9266cc8a1c85d9beULL, 0xd0962d61b56fef2dULL,
+ 0x17870f5d4f51b498ULL, 0x5577eeb6e6bb820bULL,
+ 0xdb55aacf12c73561ULL, 0x99a54b24bb2d03f2ULL,
+ 0x5eb4691841135847ULL, 0x1c4488f3e8f96ed4ULL,
+ 0x663d78ff90e185efULL, 0x24cd9914390bb37cULL,
+ 0xe3dcbb28c335e8c9ULL, 0xa12c5ac36adfde5aULL,
+ 0x2f0e1eba9ea36930ULL, 0x6dfeff5137495fa3ULL,
+ 0xaaefdd6dcd770416ULL, 0xe81f3c86649d3285ULL,
+ 0xf45bb4758c645c51ULL, 0xb6ab559e258e6ac2ULL,
+ 0x71ba77a2dfb03177ULL, 0x334a9649765a07e4ULL,
+ 0xbd68d2308226b08eULL, 0xff9833db2bcc861dULL,
+ 0x388911e7d1f2dda8ULL, 0x7a79f00c7818eb3bULL,
+ 0xcc7af1ff21c30bdeULL, 0x8e8a101488293d4dULL,
+ 0x499b3228721766f8ULL, 0x0b6bd3c3dbfd506bULL,
+ 0x854997ba2f81e701ULL, 0xc7b97651866bd192ULL,
+ 0x00a8546d7c558a27ULL, 0x4258b586d5bfbcb4ULL,
+ 0x5e1c3d753d46d260ULL, 0x1cecdc9e94ace4f3ULL,
+ 0xdbfdfea26e92bf46ULL, 0x990d1f49c77889d5ULL,
+ 0x172f5b3033043ebfULL, 0x55dfbadb9aee082cULL,
+ 0x92ce98e760d05399ULL, 0xd03e790cc93a650aULL,
+ 0xaa478900b1228e31ULL, 0xe8b768eb18c8b8a2ULL,
+ 0x2fa64ad7e2f6e317ULL, 0x6d56ab3c4b1cd584ULL,
+ 0xe374ef45bf6062eeULL, 0xa1840eae168a547dULL,
+ 0x66952c92ecb40fc8ULL, 0x2465cd79455e395bULL,
+ 0x3821458aada7578fULL, 0x7ad1a461044d611cULL,
+ 0xbdc0865dfe733aa9ULL, 0xff3067b657990c3aULL,
+ 0x711223cfa3e5bb50ULL, 0x33e2c2240a0f8dc3ULL,
+ 0xf4f3e018f031d676ULL, 0xb60301f359dbe0e5ULL,
+ 0xda050215ea6c212fULL, 0x98f5e3fe438617bcULL,
+ 0x5fe4c1c2b9b84c09ULL, 0x1d14202910527a9aULL,
+ 0x93366450e42ecdf0ULL, 0xd1c685bb4dc4fb63ULL,
+ 0x16d7a787b7faa0d6ULL, 0x5427466c1e109645ULL,
+ 0x4863ce9ff6e9f891ULL, 0x0a932f745f03ce02ULL,
+ 0xcd820d48a53d95b7ULL, 0x8f72eca30cd7a324ULL,
+ 0x0150a8daf8ab144eULL, 0x43a04931514122ddULL,
+ 0x84b16b0dab7f7968ULL, 0xc6418ae602954ffbULL,
+ 0xbc387aea7a8da4c0ULL, 0xfec89b01d3679253ULL,
+ 0x39d9b93d2959c9e6ULL, 0x7b2958d680b3ff75ULL,
+ 0xf50b1caf74cf481fULL, 0xb7fbfd44dd257e8cULL,
+ 0x70eadf78271b2539ULL, 0x321a3e938ef113aaULL,
+ 0x2e5eb66066087d7eULL, 0x6cae578bcfe24bedULL,
+ 0xabbf75b735dc1058ULL, 0xe94f945c9c3626cbULL,
+ 0x676dd025684a91a1ULL, 0x259d31cec1a0a732ULL,
+ 0xe28c13f23b9efc87ULL, 0xa07cf2199274ca14ULL,
+ 0x167ff3eacbaf2af1ULL, 0x548f120162451c62ULL,
+ 0x939e303d987b47d7ULL, 0xd16ed1d631917144ULL,
+ 0x5f4c95afc5edc62eULL, 0x1dbc74446c07f0bdULL,
+ 0xdaad56789639ab08ULL, 0x985db7933fd39d9bULL,
+ 0x84193f60d72af34fULL, 0xc6e9de8b7ec0c5dcULL,
+ 0x01f8fcb784fe9e69ULL, 0x43081d5c2d14a8faULL,
+ 0xcd2a5925d9681f90ULL, 0x8fdab8ce70822903ULL,
+ 0x48cb9af28abc72b6ULL, 0x0a3b7b1923564425ULL,
+ 0x70428b155b4eaf1eULL, 0x32b26afef2a4998dULL,
+ 0xf5a348c2089ac238ULL, 0xb753a929a170f4abULL,
+ 0x3971ed50550c43c1ULL, 0x7b810cbbfce67552ULL,
+ 0xbc902e8706d82ee7ULL, 0xfe60cf6caf321874ULL,
+ 0xe224479f47cb76a0ULL, 0xa0d4a674ee214033ULL,
+ 0x67c58448141f1b86ULL, 0x253565a3bdf52d15ULL,
+ 0xab1721da49899a7fULL, 0xe9e7c031e063acecULL,
+ 0x2ef6e20d1a5df759ULL, 0x6c0603e6b3b7c1caULL,
+ 0xf6fae5c07d3274cdULL, 0xb40a042bd4d8425eULL,
+ 0x731b26172ee619ebULL, 0x31ebc7fc870c2f78ULL,
+ 0xbfc9838573709812ULL, 0xfd39626eda9aae81ULL,
+ 0x3a28405220a4f534ULL, 0x78d8a1b9894ec3a7ULL,
+ 0x649c294a61b7ad73ULL, 0x266cc8a1c85d9be0ULL,
+ 0xe17dea9d3263c055ULL, 0xa38d0b769b89f6c6ULL,
+ 0x2daf4f0f6ff541acULL, 0x6f5faee4c61f773fULL,
+ 0xa84e8cd83c212c8aULL, 0xeabe6d3395cb1a19ULL,
+ 0x90c79d3fedd3f122ULL, 0xd2377cd44439c7b1ULL,
+ 0x15265ee8be079c04ULL, 0x57d6bf0317edaa97ULL,
+ 0xd9f4fb7ae3911dfdULL, 0x9b041a914a7b2b6eULL,
+ 0x5c1538adb04570dbULL, 0x1ee5d94619af4648ULL,
+ 0x02a151b5f156289cULL, 0x4051b05e58bc1e0fULL,
+ 0x87409262a28245baULL, 0xc5b073890b687329ULL,
+ 0x4b9237f0ff14c443ULL, 0x0962d61b56fef2d0ULL,
+ 0xce73f427acc0a965ULL, 0x8c8315cc052a9ff6ULL,
+ 0x3a80143f5cf17f13ULL, 0x7870f5d4f51b4980ULL,
+ 0xbf61d7e80f251235ULL, 0xfd913603a6cf24a6ULL,
+ 0x73b3727a52b393ccULL, 0x31439391fb59a55fULL,
+ 0xf652b1ad0167feeaULL, 0xb4a25046a88dc879ULL,
+ 0xa8e6d8b54074a6adULL, 0xea16395ee99e903eULL,
+ 0x2d071b6213a0cb8bULL, 0x6ff7fa89ba4afd18ULL,
+ 0xe1d5bef04e364a72ULL, 0xa3255f1be7dc7ce1ULL,
+ 0x64347d271de22754ULL, 0x26c49cccb40811c7ULL,
+ 0x5cbd6cc0cc10fafcULL, 0x1e4d8d2b65facc6fULL,
+ 0xd95caf179fc497daULL, 0x9bac4efc362ea149ULL,
+ 0x158e0a85c2521623ULL, 0x577eeb6e6bb820b0ULL,
+ 0x906fc95291867b05ULL, 0xd29f28b9386c4d96ULL,
+ 0xcedba04ad0952342ULL, 0x8c2b41a1797f15d1ULL,
+ 0x4b3a639d83414e64ULL, 0x09ca82762aab78f7ULL,
+ 0x87e8c60fded7cf9dULL, 0xc51827e4773df90eULL,
+ 0x020905d88d03a2bbULL, 0x40f9e43324e99428ULL,
+ 0x2cffe7d5975e55e2ULL, 0x6e0f063e3eb46371ULL,
+ 0xa91e2402c48a38c4ULL, 0xebeec5e96d600e57ULL,
+ 0x65cc8190991cb93dULL, 0x273c607b30f68faeULL,
+ 0xe02d4247cac8d41bULL, 0xa2dda3ac6322e288ULL,
+ 0xbe992b5f8bdb8c5cULL, 0xfc69cab42231bacfULL,
+ 0x3b78e888d80fe17aULL, 0x7988096371e5d7e9ULL,
+ 0xf7aa4d1a85996083ULL, 0xb55aacf12c735610ULL,
+ 0x724b8ecdd64d0da5ULL, 0x30bb6f267fa73b36ULL,
+ 0x4ac29f2a07bfd00dULL, 0x08327ec1ae55e69eULL,
+ 0xcf235cfd546bbd2bULL, 0x8dd3bd16fd818bb8ULL,
+ 0x03f1f96f09fd3cd2ULL, 0x41011884a0170a41ULL,
+ 0x86103ab85a2951f4ULL, 0xc4e0db53f3c36767ULL,
+ 0xd8a453a01b3a09b3ULL, 0x9a54b24bb2d03f20ULL,
+ 0x5d45907748ee6495ULL, 0x1fb5719ce1045206ULL,
+ 0x919735e51578e56cULL, 0xd367d40ebc92d3ffULL,
+ 0x1476f63246ac884aULL, 0x568617d9ef46bed9ULL,
+ 0xe085162ab69d5e3cULL, 0xa275f7c11f7768afULL,
+ 0x6564d5fde549331aULL, 0x279434164ca30589ULL,
+ 0xa9b6706fb8dfb2e3ULL, 0xeb46918411358470ULL,
+ 0x2c57b3b8eb0bdfc5ULL, 0x6ea7525342e1e956ULL,
+ 0x72e3daa0aa188782ULL, 0x30133b4b03f2b111ULL,
+ 0xf7021977f9cceaa4ULL, 0xb5f2f89c5026dc37ULL,
+ 0x3bd0bce5a45a6b5dULL, 0x79205d0e0db05dceULL,
+ 0xbe317f32f78e067bULL, 0xfcc19ed95e6430e8ULL,
+ 0x86b86ed5267cdbd3ULL, 0xc4488f3e8f96ed40ULL,
+ 0x0359ad0275a8b6f5ULL, 0x41a94ce9dc428066ULL,
+ 0xcf8b0890283e370cULL, 0x8d7be97b81d4019fULL,
+ 0x4a6acb477bea5a2aULL, 0x089a2aacd2006cb9ULL,
+ 0x14dea25f3af9026dULL, 0x562e43b4931334feULL,
+ 0x913f6188692d6f4bULL, 0xd3cf8063c0c759d8ULL,
+ 0x5dedc41a34bbeeb2ULL, 0x1f1d25f19d51d821ULL,
+ 0xd80c07cd676f8394ULL, 0x9afce626ce85b507ULL
+};
+
+static const uint64_t crc64_iso_refl_table[256] = {
+ 0x0000000000000000ULL, 0x01b0000000000000ULL,
+ 0x0360000000000000ULL, 0x02d0000000000000ULL,
+ 0x06c0000000000000ULL, 0x0770000000000000ULL,
+ 0x05a0000000000000ULL, 0x0410000000000000ULL,
+ 0x0d80000000000000ULL, 0x0c30000000000000ULL,
+ 0x0ee0000000000000ULL, 0x0f50000000000000ULL,
+ 0x0b40000000000000ULL, 0x0af0000000000000ULL,
+ 0x0820000000000000ULL, 0x0990000000000000ULL,
+ 0x1b00000000000000ULL, 0x1ab0000000000000ULL,
+ 0x1860000000000000ULL, 0x19d0000000000000ULL,
+ 0x1dc0000000000000ULL, 0x1c70000000000000ULL,
+ 0x1ea0000000000000ULL, 0x1f10000000000000ULL,
+ 0x1680000000000000ULL, 0x1730000000000000ULL,
+ 0x15e0000000000000ULL, 0x1450000000000000ULL,
+ 0x1040000000000000ULL, 0x11f0000000000000ULL,
+ 0x1320000000000000ULL, 0x1290000000000000ULL,
+ 0x3600000000000000ULL, 0x37b0000000000000ULL,
+ 0x3560000000000000ULL, 0x34d0000000000000ULL,
+ 0x30c0000000000000ULL, 0x3170000000000000ULL,
+ 0x33a0000000000000ULL, 0x3210000000000000ULL,
+ 0x3b80000000000000ULL, 0x3a30000000000000ULL,
+ 0x38e0000000000000ULL, 0x3950000000000000ULL,
+ 0x3d40000000000000ULL, 0x3cf0000000000000ULL,
+ 0x3e20000000000000ULL, 0x3f90000000000000ULL,
+ 0x2d00000000000000ULL, 0x2cb0000000000000ULL,
+ 0x2e60000000000000ULL, 0x2fd0000000000000ULL,
+ 0x2bc0000000000000ULL, 0x2a70000000000000ULL,
+ 0x28a0000000000000ULL, 0x2910000000000000ULL,
+ 0x2080000000000000ULL, 0x2130000000000000ULL,
+ 0x23e0000000000000ULL, 0x2250000000000000ULL,
+ 0x2640000000000000ULL, 0x27f0000000000000ULL,
+ 0x2520000000000000ULL, 0x2490000000000000ULL,
+ 0x6c00000000000000ULL, 0x6db0000000000000ULL,
+ 0x6f60000000000000ULL, 0x6ed0000000000000ULL,
+ 0x6ac0000000000000ULL, 0x6b70000000000000ULL,
+ 0x69a0000000000000ULL, 0x6810000000000000ULL,
+ 0x6180000000000000ULL, 0x6030000000000000ULL,
+ 0x62e0000000000000ULL, 0x6350000000000000ULL,
+ 0x6740000000000000ULL, 0x66f0000000000000ULL,
+ 0x6420000000000000ULL, 0x6590000000000000ULL,
+ 0x7700000000000000ULL, 0x76b0000000000000ULL,
+ 0x7460000000000000ULL, 0x75d0000000000000ULL,
+ 0x71c0000000000000ULL, 0x7070000000000000ULL,
+ 0x72a0000000000000ULL, 0x7310000000000000ULL,
+ 0x7a80000000000000ULL, 0x7b30000000000000ULL,
+ 0x79e0000000000000ULL, 0x7850000000000000ULL,
+ 0x7c40000000000000ULL, 0x7df0000000000000ULL,
+ 0x7f20000000000000ULL, 0x7e90000000000000ULL,
+ 0x5a00000000000000ULL, 0x5bb0000000000000ULL,
+ 0x5960000000000000ULL, 0x58d0000000000000ULL,
+ 0x5cc0000000000000ULL, 0x5d70000000000000ULL,
+ 0x5fa0000000000000ULL, 0x5e10000000000000ULL,
+ 0x5780000000000000ULL, 0x5630000000000000ULL,
+ 0x54e0000000000000ULL, 0x5550000000000000ULL,
+ 0x5140000000000000ULL, 0x50f0000000000000ULL,
+ 0x5220000000000000ULL, 0x5390000000000000ULL,
+ 0x4100000000000000ULL, 0x40b0000000000000ULL,
+ 0x4260000000000000ULL, 0x43d0000000000000ULL,
+ 0x47c0000000000000ULL, 0x4670000000000000ULL,
+ 0x44a0000000000000ULL, 0x4510000000000000ULL,
+ 0x4c80000000000000ULL, 0x4d30000000000000ULL,
+ 0x4fe0000000000000ULL, 0x4e50000000000000ULL,
+ 0x4a40000000000000ULL, 0x4bf0000000000000ULL,
+ 0x4920000000000000ULL, 0x4890000000000000ULL,
+ 0xd800000000000000ULL, 0xd9b0000000000000ULL,
+ 0xdb60000000000000ULL, 0xdad0000000000000ULL,
+ 0xdec0000000000000ULL, 0xdf70000000000000ULL,
+ 0xdda0000000000000ULL, 0xdc10000000000000ULL,
+ 0xd580000000000000ULL, 0xd430000000000000ULL,
+ 0xd6e0000000000000ULL, 0xd750000000000000ULL,
+ 0xd340000000000000ULL, 0xd2f0000000000000ULL,
+ 0xd020000000000000ULL, 0xd190000000000000ULL,
+ 0xc300000000000000ULL, 0xc2b0000000000000ULL,
+ 0xc060000000000000ULL, 0xc1d0000000000000ULL,
+ 0xc5c0000000000000ULL, 0xc470000000000000ULL,
+ 0xc6a0000000000000ULL, 0xc710000000000000ULL,
+ 0xce80000000000000ULL, 0xcf30000000000000ULL,
+ 0xcde0000000000000ULL, 0xcc50000000000000ULL,
+ 0xc840000000000000ULL, 0xc9f0000000000000ULL,
+ 0xcb20000000000000ULL, 0xca90000000000000ULL,
+ 0xee00000000000000ULL, 0xefb0000000000000ULL,
+ 0xed60000000000000ULL, 0xecd0000000000000ULL,
+ 0xe8c0000000000000ULL, 0xe970000000000000ULL,
+ 0xeba0000000000000ULL, 0xea10000000000000ULL,
+ 0xe380000000000000ULL, 0xe230000000000000ULL,
+ 0xe0e0000000000000ULL, 0xe150000000000000ULL,
+ 0xe540000000000000ULL, 0xe4f0000000000000ULL,
+ 0xe620000000000000ULL, 0xe790000000000000ULL,
+ 0xf500000000000000ULL, 0xf4b0000000000000ULL,
+ 0xf660000000000000ULL, 0xf7d0000000000000ULL,
+ 0xf3c0000000000000ULL, 0xf270000000000000ULL,
+ 0xf0a0000000000000ULL, 0xf110000000000000ULL,
+ 0xf880000000000000ULL, 0xf930000000000000ULL,
+ 0xfbe0000000000000ULL, 0xfa50000000000000ULL,
+ 0xfe40000000000000ULL, 0xfff0000000000000ULL,
+ 0xfd20000000000000ULL, 0xfc90000000000000ULL,
+ 0xb400000000000000ULL, 0xb5b0000000000000ULL,
+ 0xb760000000000000ULL, 0xb6d0000000000000ULL,
+ 0xb2c0000000000000ULL, 0xb370000000000000ULL,
+ 0xb1a0000000000000ULL, 0xb010000000000000ULL,
+ 0xb980000000000000ULL, 0xb830000000000000ULL,
+ 0xbae0000000000000ULL, 0xbb50000000000000ULL,
+ 0xbf40000000000000ULL, 0xbef0000000000000ULL,
+ 0xbc20000000000000ULL, 0xbd90000000000000ULL,
+ 0xaf00000000000000ULL, 0xaeb0000000000000ULL,
+ 0xac60000000000000ULL, 0xadd0000000000000ULL,
+ 0xa9c0000000000000ULL, 0xa870000000000000ULL,
+ 0xaaa0000000000000ULL, 0xab10000000000000ULL,
+ 0xa280000000000000ULL, 0xa330000000000000ULL,
+ 0xa1e0000000000000ULL, 0xa050000000000000ULL,
+ 0xa440000000000000ULL, 0xa5f0000000000000ULL,
+ 0xa720000000000000ULL, 0xa690000000000000ULL,
+ 0x8200000000000000ULL, 0x83b0000000000000ULL,
+ 0x8160000000000000ULL, 0x80d0000000000000ULL,
+ 0x84c0000000000000ULL, 0x8570000000000000ULL,
+ 0x87a0000000000000ULL, 0x8610000000000000ULL,
+ 0x8f80000000000000ULL, 0x8e30000000000000ULL,
+ 0x8ce0000000000000ULL, 0x8d50000000000000ULL,
+ 0x8940000000000000ULL, 0x88f0000000000000ULL,
+ 0x8a20000000000000ULL, 0x8b90000000000000ULL,
+ 0x9900000000000000ULL, 0x98b0000000000000ULL,
+ 0x9a60000000000000ULL, 0x9bd0000000000000ULL,
+ 0x9fc0000000000000ULL, 0x9e70000000000000ULL,
+ 0x9ca0000000000000ULL, 0x9d10000000000000ULL,
+ 0x9480000000000000ULL, 0x9530000000000000ULL,
+ 0x97e0000000000000ULL, 0x9650000000000000ULL,
+ 0x9240000000000000ULL, 0x93f0000000000000ULL,
+ 0x9120000000000000ULL, 0x9090000000000000ULL
+};
+
+static const uint64_t crc64_iso_norm_table[256] = {
+ 0x0000000000000000ULL, 0x000000000000001bULL,
+ 0x0000000000000036ULL, 0x000000000000002dULL,
+ 0x000000000000006cULL, 0x0000000000000077ULL,
+ 0x000000000000005aULL, 0x0000000000000041ULL,
+ 0x00000000000000d8ULL, 0x00000000000000c3ULL,
+ 0x00000000000000eeULL, 0x00000000000000f5ULL,
+ 0x00000000000000b4ULL, 0x00000000000000afULL,
+ 0x0000000000000082ULL, 0x0000000000000099ULL,
+ 0x00000000000001b0ULL, 0x00000000000001abULL,
+ 0x0000000000000186ULL, 0x000000000000019dULL,
+ 0x00000000000001dcULL, 0x00000000000001c7ULL,
+ 0x00000000000001eaULL, 0x00000000000001f1ULL,
+ 0x0000000000000168ULL, 0x0000000000000173ULL,
+ 0x000000000000015eULL, 0x0000000000000145ULL,
+ 0x0000000000000104ULL, 0x000000000000011fULL,
+ 0x0000000000000132ULL, 0x0000000000000129ULL,
+ 0x0000000000000360ULL, 0x000000000000037bULL,
+ 0x0000000000000356ULL, 0x000000000000034dULL,
+ 0x000000000000030cULL, 0x0000000000000317ULL,
+ 0x000000000000033aULL, 0x0000000000000321ULL,
+ 0x00000000000003b8ULL, 0x00000000000003a3ULL,
+ 0x000000000000038eULL, 0x0000000000000395ULL,
+ 0x00000000000003d4ULL, 0x00000000000003cfULL,
+ 0x00000000000003e2ULL, 0x00000000000003f9ULL,
+ 0x00000000000002d0ULL, 0x00000000000002cbULL,
+ 0x00000000000002e6ULL, 0x00000000000002fdULL,
+ 0x00000000000002bcULL, 0x00000000000002a7ULL,
+ 0x000000000000028aULL, 0x0000000000000291ULL,
+ 0x0000000000000208ULL, 0x0000000000000213ULL,
+ 0x000000000000023eULL, 0x0000000000000225ULL,
+ 0x0000000000000264ULL, 0x000000000000027fULL,
+ 0x0000000000000252ULL, 0x0000000000000249ULL,
+ 0x00000000000006c0ULL, 0x00000000000006dbULL,
+ 0x00000000000006f6ULL, 0x00000000000006edULL,
+ 0x00000000000006acULL, 0x00000000000006b7ULL,
+ 0x000000000000069aULL, 0x0000000000000681ULL,
+ 0x0000000000000618ULL, 0x0000000000000603ULL,
+ 0x000000000000062eULL, 0x0000000000000635ULL,
+ 0x0000000000000674ULL, 0x000000000000066fULL,
+ 0x0000000000000642ULL, 0x0000000000000659ULL,
+ 0x0000000000000770ULL, 0x000000000000076bULL,
+ 0x0000000000000746ULL, 0x000000000000075dULL,
+ 0x000000000000071cULL, 0x0000000000000707ULL,
+ 0x000000000000072aULL, 0x0000000000000731ULL,
+ 0x00000000000007a8ULL, 0x00000000000007b3ULL,
+ 0x000000000000079eULL, 0x0000000000000785ULL,
+ 0x00000000000007c4ULL, 0x00000000000007dfULL,
+ 0x00000000000007f2ULL, 0x00000000000007e9ULL,
+ 0x00000000000005a0ULL, 0x00000000000005bbULL,
+ 0x0000000000000596ULL, 0x000000000000058dULL,
+ 0x00000000000005ccULL, 0x00000000000005d7ULL,
+ 0x00000000000005faULL, 0x00000000000005e1ULL,
+ 0x0000000000000578ULL, 0x0000000000000563ULL,
+ 0x000000000000054eULL, 0x0000000000000555ULL,
+ 0x0000000000000514ULL, 0x000000000000050fULL,
+ 0x0000000000000522ULL, 0x0000000000000539ULL,
+ 0x0000000000000410ULL, 0x000000000000040bULL,
+ 0x0000000000000426ULL, 0x000000000000043dULL,
+ 0x000000000000047cULL, 0x0000000000000467ULL,
+ 0x000000000000044aULL, 0x0000000000000451ULL,
+ 0x00000000000004c8ULL, 0x00000000000004d3ULL,
+ 0x00000000000004feULL, 0x00000000000004e5ULL,
+ 0x00000000000004a4ULL, 0x00000000000004bfULL,
+ 0x0000000000000492ULL, 0x0000000000000489ULL,
+ 0x0000000000000d80ULL, 0x0000000000000d9bULL,
+ 0x0000000000000db6ULL, 0x0000000000000dadULL,
+ 0x0000000000000decULL, 0x0000000000000df7ULL,
+ 0x0000000000000ddaULL, 0x0000000000000dc1ULL,
+ 0x0000000000000d58ULL, 0x0000000000000d43ULL,
+ 0x0000000000000d6eULL, 0x0000000000000d75ULL,
+ 0x0000000000000d34ULL, 0x0000000000000d2fULL,
+ 0x0000000000000d02ULL, 0x0000000000000d19ULL,
+ 0x0000000000000c30ULL, 0x0000000000000c2bULL,
+ 0x0000000000000c06ULL, 0x0000000000000c1dULL,
+ 0x0000000000000c5cULL, 0x0000000000000c47ULL,
+ 0x0000000000000c6aULL, 0x0000000000000c71ULL,
+ 0x0000000000000ce8ULL, 0x0000000000000cf3ULL,
+ 0x0000000000000cdeULL, 0x0000000000000cc5ULL,
+ 0x0000000000000c84ULL, 0x0000000000000c9fULL,
+ 0x0000000000000cb2ULL, 0x0000000000000ca9ULL,
+ 0x0000000000000ee0ULL, 0x0000000000000efbULL,
+ 0x0000000000000ed6ULL, 0x0000000000000ecdULL,
+ 0x0000000000000e8cULL, 0x0000000000000e97ULL,
+ 0x0000000000000ebaULL, 0x0000000000000ea1ULL,
+ 0x0000000000000e38ULL, 0x0000000000000e23ULL,
+ 0x0000000000000e0eULL, 0x0000000000000e15ULL,
+ 0x0000000000000e54ULL, 0x0000000000000e4fULL,
+ 0x0000000000000e62ULL, 0x0000000000000e79ULL,
+ 0x0000000000000f50ULL, 0x0000000000000f4bULL,
+ 0x0000000000000f66ULL, 0x0000000000000f7dULL,
+ 0x0000000000000f3cULL, 0x0000000000000f27ULL,
+ 0x0000000000000f0aULL, 0x0000000000000f11ULL,
+ 0x0000000000000f88ULL, 0x0000000000000f93ULL,
+ 0x0000000000000fbeULL, 0x0000000000000fa5ULL,
+ 0x0000000000000fe4ULL, 0x0000000000000fffULL,
+ 0x0000000000000fd2ULL, 0x0000000000000fc9ULL,
+ 0x0000000000000b40ULL, 0x0000000000000b5bULL,
+ 0x0000000000000b76ULL, 0x0000000000000b6dULL,
+ 0x0000000000000b2cULL, 0x0000000000000b37ULL,
+ 0x0000000000000b1aULL, 0x0000000000000b01ULL,
+ 0x0000000000000b98ULL, 0x0000000000000b83ULL,
+ 0x0000000000000baeULL, 0x0000000000000bb5ULL,
+ 0x0000000000000bf4ULL, 0x0000000000000befULL,
+ 0x0000000000000bc2ULL, 0x0000000000000bd9ULL,
+ 0x0000000000000af0ULL, 0x0000000000000aebULL,
+ 0x0000000000000ac6ULL, 0x0000000000000addULL,
+ 0x0000000000000a9cULL, 0x0000000000000a87ULL,
+ 0x0000000000000aaaULL, 0x0000000000000ab1ULL,
+ 0x0000000000000a28ULL, 0x0000000000000a33ULL,
+ 0x0000000000000a1eULL, 0x0000000000000a05ULL,
+ 0x0000000000000a44ULL, 0x0000000000000a5fULL,
+ 0x0000000000000a72ULL, 0x0000000000000a69ULL,
+ 0x0000000000000820ULL, 0x000000000000083bULL,
+ 0x0000000000000816ULL, 0x000000000000080dULL,
+ 0x000000000000084cULL, 0x0000000000000857ULL,
+ 0x000000000000087aULL, 0x0000000000000861ULL,
+ 0x00000000000008f8ULL, 0x00000000000008e3ULL,
+ 0x00000000000008ceULL, 0x00000000000008d5ULL,
+ 0x0000000000000894ULL, 0x000000000000088fULL,
+ 0x00000000000008a2ULL, 0x00000000000008b9ULL,
+ 0x0000000000000990ULL, 0x000000000000098bULL,
+ 0x00000000000009a6ULL, 0x00000000000009bdULL,
+ 0x00000000000009fcULL, 0x00000000000009e7ULL,
+ 0x00000000000009caULL, 0x00000000000009d1ULL,
+ 0x0000000000000948ULL, 0x0000000000000953ULL,
+ 0x000000000000097eULL, 0x0000000000000965ULL,
+ 0x0000000000000924ULL, 0x000000000000093fULL,
+ 0x0000000000000912ULL, 0x0000000000000909ULL
+};
+
+static const uint64_t crc64_jones_refl_table[256] = {
+ 0x0000000000000000ULL, 0x7ad870c830358979ULL,
+ 0xf5b0e190606b12f2ULL, 0x8f689158505e9b8bULL,
+ 0xc038e5739841b68fULL, 0xbae095bba8743ff6ULL,
+ 0x358804e3f82aa47dULL, 0x4f50742bc81f2d04ULL,
+ 0xab28ecb46814fe75ULL, 0xd1f09c7c5821770cULL,
+ 0x5e980d24087fec87ULL, 0x24407dec384a65feULL,
+ 0x6b1009c7f05548faULL, 0x11c8790fc060c183ULL,
+ 0x9ea0e857903e5a08ULL, 0xe478989fa00bd371ULL,
+ 0x7d08ff3b88be6f81ULL, 0x07d08ff3b88be6f8ULL,
+ 0x88b81eabe8d57d73ULL, 0xf2606e63d8e0f40aULL,
+ 0xbd301a4810ffd90eULL, 0xc7e86a8020ca5077ULL,
+ 0x4880fbd87094cbfcULL, 0x32588b1040a14285ULL,
+ 0xd620138fe0aa91f4ULL, 0xacf86347d09f188dULL,
+ 0x2390f21f80c18306ULL, 0x594882d7b0f40a7fULL,
+ 0x1618f6fc78eb277bULL, 0x6cc0863448deae02ULL,
+ 0xe3a8176c18803589ULL, 0x997067a428b5bcf0ULL,
+ 0xfa11fe77117cdf02ULL, 0x80c98ebf2149567bULL,
+ 0x0fa11fe77117cdf0ULL, 0x75796f2f41224489ULL,
+ 0x3a291b04893d698dULL, 0x40f16bccb908e0f4ULL,
+ 0xcf99fa94e9567b7fULL, 0xb5418a5cd963f206ULL,
+ 0x513912c379682177ULL, 0x2be1620b495da80eULL,
+ 0xa489f35319033385ULL, 0xde51839b2936bafcULL,
+ 0x9101f7b0e12997f8ULL, 0xebd98778d11c1e81ULL,
+ 0x64b116208142850aULL, 0x1e6966e8b1770c73ULL,
+ 0x8719014c99c2b083ULL, 0xfdc17184a9f739faULL,
+ 0x72a9e0dcf9a9a271ULL, 0x08719014c99c2b08ULL,
+ 0x4721e43f0183060cULL, 0x3df994f731b68f75ULL,
+ 0xb29105af61e814feULL, 0xc849756751dd9d87ULL,
+ 0x2c31edf8f1d64ef6ULL, 0x56e99d30c1e3c78fULL,
+ 0xd9810c6891bd5c04ULL, 0xa3597ca0a188d57dULL,
+ 0xec09088b6997f879ULL, 0x96d1784359a27100ULL,
+ 0x19b9e91b09fcea8bULL, 0x636199d339c963f2ULL,
+ 0xdf7adabd7a6e2d6fULL, 0xa5a2aa754a5ba416ULL,
+ 0x2aca3b2d1a053f9dULL, 0x50124be52a30b6e4ULL,
+ 0x1f423fcee22f9be0ULL, 0x659a4f06d21a1299ULL,
+ 0xeaf2de5e82448912ULL, 0x902aae96b271006bULL,
+ 0x74523609127ad31aULL, 0x0e8a46c1224f5a63ULL,
+ 0x81e2d7997211c1e8ULL, 0xfb3aa75142244891ULL,
+ 0xb46ad37a8a3b6595ULL, 0xceb2a3b2ba0eececULL,
+ 0x41da32eaea507767ULL, 0x3b024222da65fe1eULL,
+ 0xa2722586f2d042eeULL, 0xd8aa554ec2e5cb97ULL,
+ 0x57c2c41692bb501cULL, 0x2d1ab4dea28ed965ULL,
+ 0x624ac0f56a91f461ULL, 0x1892b03d5aa47d18ULL,
+ 0x97fa21650afae693ULL, 0xed2251ad3acf6feaULL,
+ 0x095ac9329ac4bc9bULL, 0x7382b9faaaf135e2ULL,
+ 0xfcea28a2faafae69ULL, 0x8632586aca9a2710ULL,
+ 0xc9622c4102850a14ULL, 0xb3ba5c8932b0836dULL,
+ 0x3cd2cdd162ee18e6ULL, 0x460abd1952db919fULL,
+ 0x256b24ca6b12f26dULL, 0x5fb354025b277b14ULL,
+ 0xd0dbc55a0b79e09fULL, 0xaa03b5923b4c69e6ULL,
+ 0xe553c1b9f35344e2ULL, 0x9f8bb171c366cd9bULL,
+ 0x10e3202993385610ULL, 0x6a3b50e1a30ddf69ULL,
+ 0x8e43c87e03060c18ULL, 0xf49bb8b633338561ULL,
+ 0x7bf329ee636d1eeaULL, 0x012b592653589793ULL,
+ 0x4e7b2d0d9b47ba97ULL, 0x34a35dc5ab7233eeULL,
+ 0xbbcbcc9dfb2ca865ULL, 0xc113bc55cb19211cULL,
+ 0x5863dbf1e3ac9decULL, 0x22bbab39d3991495ULL,
+ 0xadd33a6183c78f1eULL, 0xd70b4aa9b3f20667ULL,
+ 0x985b3e827bed2b63ULL, 0xe2834e4a4bd8a21aULL,
+ 0x6debdf121b863991ULL, 0x1733afda2bb3b0e8ULL,
+ 0xf34b37458bb86399ULL, 0x8993478dbb8deae0ULL,
+ 0x06fbd6d5ebd3716bULL, 0x7c23a61ddbe6f812ULL,
+ 0x3373d23613f9d516ULL, 0x49aba2fe23cc5c6fULL,
+ 0xc6c333a67392c7e4ULL, 0xbc1b436e43a74e9dULL,
+ 0x95ac9329ac4bc9b5ULL, 0xef74e3e19c7e40ccULL,
+ 0x601c72b9cc20db47ULL, 0x1ac40271fc15523eULL,
+ 0x5594765a340a7f3aULL, 0x2f4c0692043ff643ULL,
+ 0xa02497ca54616dc8ULL, 0xdafce7026454e4b1ULL,
+ 0x3e847f9dc45f37c0ULL, 0x445c0f55f46abeb9ULL,
+ 0xcb349e0da4342532ULL, 0xb1eceec59401ac4bULL,
+ 0xfebc9aee5c1e814fULL, 0x8464ea266c2b0836ULL,
+ 0x0b0c7b7e3c7593bdULL, 0x71d40bb60c401ac4ULL,
+ 0xe8a46c1224f5a634ULL, 0x927c1cda14c02f4dULL,
+ 0x1d148d82449eb4c6ULL, 0x67ccfd4a74ab3dbfULL,
+ 0x289c8961bcb410bbULL, 0x5244f9a98c8199c2ULL,
+ 0xdd2c68f1dcdf0249ULL, 0xa7f41839ecea8b30ULL,
+ 0x438c80a64ce15841ULL, 0x3954f06e7cd4d138ULL,
+ 0xb63c61362c8a4ab3ULL, 0xcce411fe1cbfc3caULL,
+ 0x83b465d5d4a0eeceULL, 0xf96c151de49567b7ULL,
+ 0x76048445b4cbfc3cULL, 0x0cdcf48d84fe7545ULL,
+ 0x6fbd6d5ebd3716b7ULL, 0x15651d968d029fceULL,
+ 0x9a0d8ccedd5c0445ULL, 0xe0d5fc06ed698d3cULL,
+ 0xaf85882d2576a038ULL, 0xd55df8e515432941ULL,
+ 0x5a3569bd451db2caULL, 0x20ed197575283bb3ULL,
+ 0xc49581ead523e8c2ULL, 0xbe4df122e51661bbULL,
+ 0x3125607ab548fa30ULL, 0x4bfd10b2857d7349ULL,
+ 0x04ad64994d625e4dULL, 0x7e7514517d57d734ULL,
+ 0xf11d85092d094cbfULL, 0x8bc5f5c11d3cc5c6ULL,
+ 0x12b5926535897936ULL, 0x686de2ad05bcf04fULL,
+ 0xe70573f555e26bc4ULL, 0x9ddd033d65d7e2bdULL,
+ 0xd28d7716adc8cfb9ULL, 0xa85507de9dfd46c0ULL,
+ 0x273d9686cda3dd4bULL, 0x5de5e64efd965432ULL,
+ 0xb99d7ed15d9d8743ULL, 0xc3450e196da80e3aULL,
+ 0x4c2d9f413df695b1ULL, 0x36f5ef890dc31cc8ULL,
+ 0x79a59ba2c5dc31ccULL, 0x037deb6af5e9b8b5ULL,
+ 0x8c157a32a5b7233eULL, 0xf6cd0afa9582aa47ULL,
+ 0x4ad64994d625e4daULL, 0x300e395ce6106da3ULL,
+ 0xbf66a804b64ef628ULL, 0xc5bed8cc867b7f51ULL,
+ 0x8aeeace74e645255ULL, 0xf036dc2f7e51db2cULL,
+ 0x7f5e4d772e0f40a7ULL, 0x05863dbf1e3ac9deULL,
+ 0xe1fea520be311aafULL, 0x9b26d5e88e0493d6ULL,
+ 0x144e44b0de5a085dULL, 0x6e963478ee6f8124ULL,
+ 0x21c640532670ac20ULL, 0x5b1e309b16452559ULL,
+ 0xd476a1c3461bbed2ULL, 0xaeaed10b762e37abULL,
+ 0x37deb6af5e9b8b5bULL, 0x4d06c6676eae0222ULL,
+ 0xc26e573f3ef099a9ULL, 0xb8b627f70ec510d0ULL,
+ 0xf7e653dcc6da3dd4ULL, 0x8d3e2314f6efb4adULL,
+ 0x0256b24ca6b12f26ULL, 0x788ec2849684a65fULL,
+ 0x9cf65a1b368f752eULL, 0xe62e2ad306bafc57ULL,
+ 0x6946bb8b56e467dcULL, 0x139ecb4366d1eea5ULL,
+ 0x5ccebf68aecec3a1ULL, 0x2616cfa09efb4ad8ULL,
+ 0xa97e5ef8cea5d153ULL, 0xd3a62e30fe90582aULL,
+ 0xb0c7b7e3c7593bd8ULL, 0xca1fc72bf76cb2a1ULL,
+ 0x45775673a732292aULL, 0x3faf26bb9707a053ULL,
+ 0x70ff52905f188d57ULL, 0x0a2722586f2d042eULL,
+ 0x854fb3003f739fa5ULL, 0xff97c3c80f4616dcULL,
+ 0x1bef5b57af4dc5adULL, 0x61372b9f9f784cd4ULL,
+ 0xee5fbac7cf26d75fULL, 0x9487ca0fff135e26ULL,
+ 0xdbd7be24370c7322ULL, 0xa10fceec0739fa5bULL,
+ 0x2e675fb4576761d0ULL, 0x54bf2f7c6752e8a9ULL,
+ 0xcdcf48d84fe75459ULL, 0xb71738107fd2dd20ULL,
+ 0x387fa9482f8c46abULL, 0x42a7d9801fb9cfd2ULL,
+ 0x0df7adabd7a6e2d6ULL, 0x772fdd63e7936bafULL,
+ 0xf8474c3bb7cdf024ULL, 0x829f3cf387f8795dULL,
+ 0x66e7a46c27f3aa2cULL, 0x1c3fd4a417c62355ULL,
+ 0x935745fc4798b8deULL, 0xe98f353477ad31a7ULL,
+ 0xa6df411fbfb21ca3ULL, 0xdc0731d78f8795daULL,
+ 0x536fa08fdfd90e51ULL, 0x29b7d047efec8728ULL
+};
+
+static const uint64_t crc64_jones_norm_table[256] = {
+ 0x0000000000000000ULL, 0xad93d23594c935a9ULL,
+ 0xf6b4765ebd5b5efbULL, 0x5b27a46b29926b52ULL,
+ 0x40fb3e88ee7f885fULL, 0xed68ecbd7ab6bdf6ULL,
+ 0xb64f48d65324d6a4ULL, 0x1bdc9ae3c7ede30dULL,
+ 0x81f67d11dcff10beULL, 0x2c65af2448362517ULL,
+ 0x77420b4f61a44e45ULL, 0xdad1d97af56d7becULL,
+ 0xc10d4399328098e1ULL, 0x6c9e91aca649ad48ULL,
+ 0x37b935c78fdbc61aULL, 0x9a2ae7f21b12f3b3ULL,
+ 0xae7f28162d3714d5ULL, 0x03ecfa23b9fe217cULL,
+ 0x58cb5e48906c4a2eULL, 0xf5588c7d04a57f87ULL,
+ 0xee84169ec3489c8aULL, 0x4317c4ab5781a923ULL,
+ 0x183060c07e13c271ULL, 0xb5a3b2f5eadaf7d8ULL,
+ 0x2f895507f1c8046bULL, 0x821a8732650131c2ULL,
+ 0xd93d23594c935a90ULL, 0x74aef16cd85a6f39ULL,
+ 0x6f726b8f1fb78c34ULL, 0xc2e1b9ba8b7eb99dULL,
+ 0x99c61dd1a2ecd2cfULL, 0x3455cfe43625e766ULL,
+ 0xf16d8219cea71c03ULL, 0x5cfe502c5a6e29aaULL,
+ 0x07d9f44773fc42f8ULL, 0xaa4a2672e7357751ULL,
+ 0xb196bc9120d8945cULL, 0x1c056ea4b411a1f5ULL,
+ 0x4722cacf9d83caa7ULL, 0xeab118fa094aff0eULL,
+ 0x709bff0812580cbdULL, 0xdd082d3d86913914ULL,
+ 0x862f8956af035246ULL, 0x2bbc5b633bca67efULL,
+ 0x3060c180fc2784e2ULL, 0x9df313b568eeb14bULL,
+ 0xc6d4b7de417cda19ULL, 0x6b4765ebd5b5efb0ULL,
+ 0x5f12aa0fe39008d6ULL, 0xf281783a77593d7fULL,
+ 0xa9a6dc515ecb562dULL, 0x04350e64ca026384ULL,
+ 0x1fe994870def8089ULL, 0xb27a46b29926b520ULL,
+ 0xe95de2d9b0b4de72ULL, 0x44ce30ec247debdbULL,
+ 0xdee4d71e3f6f1868ULL, 0x7377052baba62dc1ULL,
+ 0x2850a14082344693ULL, 0x85c3737516fd733aULL,
+ 0x9e1fe996d1109037ULL, 0x338c3ba345d9a59eULL,
+ 0x68ab9fc86c4bceccULL, 0xc5384dfdf882fb65ULL,
+ 0x4f48d60609870dafULL, 0xe2db04339d4e3806ULL,
+ 0xb9fca058b4dc5354ULL, 0x146f726d201566fdULL,
+ 0x0fb3e88ee7f885f0ULL, 0xa2203abb7331b059ULL,
+ 0xf9079ed05aa3db0bULL, 0x54944ce5ce6aeea2ULL,
+ 0xcebeab17d5781d11ULL, 0x632d792241b128b8ULL,
+ 0x380add49682343eaULL, 0x95990f7cfcea7643ULL,
+ 0x8e45959f3b07954eULL, 0x23d647aaafcea0e7ULL,
+ 0x78f1e3c1865ccbb5ULL, 0xd56231f41295fe1cULL,
+ 0xe137fe1024b0197aULL, 0x4ca42c25b0792cd3ULL,
+ 0x1783884e99eb4781ULL, 0xba105a7b0d227228ULL,
+ 0xa1ccc098cacf9125ULL, 0x0c5f12ad5e06a48cULL,
+ 0x5778b6c67794cfdeULL, 0xfaeb64f3e35dfa77ULL,
+ 0x60c18301f84f09c4ULL, 0xcd5251346c863c6dULL,
+ 0x9675f55f4514573fULL, 0x3be6276ad1dd6296ULL,
+ 0x203abd891630819bULL, 0x8da96fbc82f9b432ULL,
+ 0xd68ecbd7ab6bdf60ULL, 0x7b1d19e23fa2eac9ULL,
+ 0xbe25541fc72011acULL, 0x13b6862a53e92405ULL,
+ 0x489122417a7b4f57ULL, 0xe502f074eeb27afeULL,
+ 0xfede6a97295f99f3ULL, 0x534db8a2bd96ac5aULL,
+ 0x086a1cc99404c708ULL, 0xa5f9cefc00cdf2a1ULL,
+ 0x3fd3290e1bdf0112ULL, 0x9240fb3b8f1634bbULL,
+ 0xc9675f50a6845fe9ULL, 0x64f48d65324d6a40ULL,
+ 0x7f281786f5a0894dULL, 0xd2bbc5b36169bce4ULL,
+ 0x899c61d848fbd7b6ULL, 0x240fb3eddc32e21fULL,
+ 0x105a7c09ea170579ULL, 0xbdc9ae3c7ede30d0ULL,
+ 0xe6ee0a57574c5b82ULL, 0x4b7dd862c3856e2bULL,
+ 0x50a1428104688d26ULL, 0xfd3290b490a1b88fULL,
+ 0xa61534dfb933d3ddULL, 0x0b86e6ea2dfae674ULL,
+ 0x91ac011836e815c7ULL, 0x3c3fd32da221206eULL,
+ 0x671877468bb34b3cULL, 0xca8ba5731f7a7e95ULL,
+ 0xd1573f90d8979d98ULL, 0x7cc4eda54c5ea831ULL,
+ 0x27e349ce65ccc363ULL, 0x8a709bfbf105f6caULL,
+ 0x9e91ac0c130e1b5eULL, 0x33027e3987c72ef7ULL,
+ 0x6825da52ae5545a5ULL, 0xc5b608673a9c700cULL,
+ 0xde6a9284fd719301ULL, 0x73f940b169b8a6a8ULL,
+ 0x28dee4da402acdfaULL, 0x854d36efd4e3f853ULL,
+ 0x1f67d11dcff10be0ULL, 0xb2f403285b383e49ULL,
+ 0xe9d3a74372aa551bULL, 0x44407576e66360b2ULL,
+ 0x5f9cef95218e83bfULL, 0xf20f3da0b547b616ULL,
+ 0xa92899cb9cd5dd44ULL, 0x04bb4bfe081ce8edULL,
+ 0x30ee841a3e390f8bULL, 0x9d7d562faaf03a22ULL,
+ 0xc65af24483625170ULL, 0x6bc9207117ab64d9ULL,
+ 0x7015ba92d04687d4ULL, 0xdd8668a7448fb27dULL,
+ 0x86a1cccc6d1dd92fULL, 0x2b321ef9f9d4ec86ULL,
+ 0xb118f90be2c61f35ULL, 0x1c8b2b3e760f2a9cULL,
+ 0x47ac8f555f9d41ceULL, 0xea3f5d60cb547467ULL,
+ 0xf1e3c7830cb9976aULL, 0x5c7015b69870a2c3ULL,
+ 0x0757b1ddb1e2c991ULL, 0xaac463e8252bfc38ULL,
+ 0x6ffc2e15dda9075dULL, 0xc26ffc20496032f4ULL,
+ 0x9948584b60f259a6ULL, 0x34db8a7ef43b6c0fULL,
+ 0x2f07109d33d68f02ULL, 0x8294c2a8a71fbaabULL,
+ 0xd9b366c38e8dd1f9ULL, 0x7420b4f61a44e450ULL,
+ 0xee0a5304015617e3ULL, 0x43998131959f224aULL,
+ 0x18be255abc0d4918ULL, 0xb52df76f28c47cb1ULL,
+ 0xaef16d8cef299fbcULL, 0x0362bfb97be0aa15ULL,
+ 0x58451bd25272c147ULL, 0xf5d6c9e7c6bbf4eeULL,
+ 0xc1830603f09e1388ULL, 0x6c10d43664572621ULL,
+ 0x3737705d4dc54d73ULL, 0x9aa4a268d90c78daULL,
+ 0x8178388b1ee19bd7ULL, 0x2cebeabe8a28ae7eULL,
+ 0x77cc4ed5a3bac52cULL, 0xda5f9ce03773f085ULL,
+ 0x40757b122c610336ULL, 0xede6a927b8a8369fULL,
+ 0xb6c10d4c913a5dcdULL, 0x1b52df7905f36864ULL,
+ 0x008e459ac21e8b69ULL, 0xad1d97af56d7bec0ULL,
+ 0xf63a33c47f45d592ULL, 0x5ba9e1f1eb8ce03bULL,
+ 0xd1d97a0a1a8916f1ULL, 0x7c4aa83f8e402358ULL,
+ 0x276d0c54a7d2480aULL, 0x8afede61331b7da3ULL,
+ 0x91224482f4f69eaeULL, 0x3cb196b7603fab07ULL,
+ 0x679632dc49adc055ULL, 0xca05e0e9dd64f5fcULL,
+ 0x502f071bc676064fULL, 0xfdbcd52e52bf33e6ULL,
+ 0xa69b71457b2d58b4ULL, 0x0b08a370efe46d1dULL,
+ 0x10d4399328098e10ULL, 0xbd47eba6bcc0bbb9ULL,
+ 0xe6604fcd9552d0ebULL, 0x4bf39df8019be542ULL,
+ 0x7fa6521c37be0224ULL, 0xd2358029a377378dULL,
+ 0x891224428ae55cdfULL, 0x2481f6771e2c6976ULL,
+ 0x3f5d6c94d9c18a7bULL, 0x92cebea14d08bfd2ULL,
+ 0xc9e91aca649ad480ULL, 0x647ac8fff053e129ULL,
+ 0xfe502f0deb41129aULL, 0x53c3fd387f882733ULL,
+ 0x08e45953561a4c61ULL, 0xa5778b66c2d379c8ULL,
+ 0xbeab1185053e9ac5ULL, 0x1338c3b091f7af6cULL,
+ 0x481f67dbb865c43eULL, 0xe58cb5ee2cacf197ULL,
+ 0x20b4f813d42e0af2ULL, 0x8d272a2640e73f5bULL,
+ 0xd6008e4d69755409ULL, 0x7b935c78fdbc61a0ULL,
+ 0x604fc69b3a5182adULL, 0xcddc14aeae98b704ULL,
+ 0x96fbb0c5870adc56ULL, 0x3b6862f013c3e9ffULL,
+ 0xa142850208d11a4cULL, 0x0cd157379c182fe5ULL,
+ 0x57f6f35cb58a44b7ULL, 0xfa6521692143711eULL,
+ 0xe1b9bb8ae6ae9213ULL, 0x4c2a69bf7267a7baULL,
+ 0x170dcdd45bf5cce8ULL, 0xba9e1fe1cf3cf941ULL,
+ 0x8ecbd005f9191e27ULL, 0x235802306dd02b8eULL,
+ 0x787fa65b444240dcULL, 0xd5ec746ed08b7575ULL,
+ 0xce30ee8d17669678ULL, 0x63a33cb883afa3d1ULL,
+ 0x388498d3aa3dc883ULL, 0x95174ae63ef4fd2aULL,
+ 0x0f3dad1425e60e99ULL, 0xa2ae7f21b12f3b30ULL,
+ 0xf989db4a98bd5062ULL, 0x541a097f0c7465cbULL,
+ 0x4fc6939ccb9986c6ULL, 0xe25541a95f50b36fULL,
+ 0xb972e5c276c2d83dULL, 0x14e137f7e20bed94ULL
+};
+
+uint64_t crc64_ecma_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t i, crc = ~seed;
+
+ for (i = 0; i < len; i++) {
+ uint8_t byte = buf[i];
+ crc = crc64_ecma_refl_table[(uint8_t) crc ^ byte] ^ (crc >> 8);
+ }
+
+ return ~crc;
+}
+
+uint64_t crc64_ecma_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t i, crc = ~seed;
+
+ for (i = 0; i < len; i++) {
+ uint8_t byte = buf[i];
+ crc = crc64_ecma_norm_table[((crc >> 56) ^ byte) & 0xff] ^ (crc << 8);
+ }
+
+ return ~crc;
+}
+
+uint64_t crc64_iso_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t i, crc = ~seed;
+
+ for (i = 0; i < len; i++) {
+ uint8_t byte = buf[i];
+ crc = crc64_iso_refl_table[(uint8_t) crc ^ byte] ^ (crc >> 8);
+ }
+
+ return ~crc;
+}
+
+uint64_t crc64_iso_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t i, crc = ~seed;
+
+ for (i = 0; i < len; i++) {
+ uint8_t byte = buf[i];
+ crc = crc64_iso_norm_table[((crc >> 56) ^ byte) & 0xff] ^ (crc << 8);
+ }
+
+ return ~crc;
+}
+
+uint64_t crc64_jones_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t i, crc = ~seed;
+
+ for (i = 0; i < len; i++) {
+ uint8_t byte = buf[i];
+ crc = crc64_jones_refl_table[(uint8_t) crc ^ byte] ^ (crc >> 8);
+ }
+
+ return ~crc;
+}
+
+uint64_t crc64_jones_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t i, crc = ~seed;
+
+ for (i = 0; i < len; i++) {
+ uint8_t byte = buf[i];
+ crc = crc64_jones_norm_table[((crc >> 56) ^ byte) & 0xff] ^ (crc << 8);
+ }
+
+ return ~crc;
+}
+
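+/*
+ * Illustrative sketch only (not part of the library): each of the 256-entry
+ * tables above follows the classic byte-at-a-time construction. For example,
+ * the reflected ECMA table can be regenerated from the reflected ECMA-182
+ * polynomial 0xC96C5795D7870F42 as:
+ *
+ *     for (int i = 0; i < 256; i++) {
+ *         uint64_t crc = (uint64_t)i;
+ *         for (int j = 0; j < 8; j++)
+ *             crc = (crc & 1) ? (crc >> 1) ^ 0xC96C5795D7870F42ULL : crc >> 1;
+ *         crc64_ecma_refl_table_check[i] = crc;   // hypothetical check array
+ *     }
+ *
+ * The normal-form tables use the same recurrence on the top bit with the
+ * non-reflected polynomial and a left shift instead.
+ */
+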
+struct slver {
+ unsigned short snum;
+ unsigned char ver;
+ unsigned char core;
+};
+
+struct slver crc64_ecma_refl_base_slver_0000001c;
+struct slver crc64_ecma_refl_base_slver = { 0x001c, 0x00, 0x00 };
+
+struct slver crc64_ecma_norm_base_slver_00000019;
+struct slver crc64_ecma_norm_base_slver = { 0x0019, 0x00, 0x00 };
+
+struct slver crc64_iso_refl_base_slver_00000022;
+struct slver crc64_iso_refl_base_slver = { 0x0022, 0x00, 0x00 };
+
+struct slver crc64_iso_norm_base_slver_0000001f;
+struct slver crc64_iso_norm_base_slver = { 0x001f, 0x00, 0x00 };
+
+struct slver crc64_jones_refl_base_slver_00000028;
+struct slver crc64_jones_refl_base_slver = { 0x0028, 0x00, 0x00 };
+
+struct slver crc64_jones_norm_base_slver_00000025;
+struct slver crc64_jones_norm_base_slver = { 0x0025, 0x00, 0x00 };
diff --git a/src/isa-l/crc/crc64_ecma_norm_by16_10.asm b/src/isa-l/crc/crc64_ecma_norm_by16_10.asm
new file mode 100644
index 000000000..8b09a89c4
--- /dev/null
+++ b/src/isa-l/crc/crc64_ecma_norm_by16_10.asm
@@ -0,0 +1,61 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define FUNCTION_NAME crc64_ecma_norm_by16_10
+%define USE_CONSTS
+%macro INCLUDE_CONSTS 0
+rk_1: dq 0x7f52691a60ddc70d
+rk_2: dq 0x7036b0389f6a0c82
+rk1: dq 0x05f5c3c7eb52fab6
+rk2: dq 0x4eb938a7d257740e
+rk3: dq 0x05cf79dea9ac37d6
+rk4: dq 0x001067e571d7d5c2
+rk5: dq 0x05f5c3c7eb52fab6
+rk6: dq 0x0000000000000000
+rk7: dq 0x578d29d06cc4f872
+rk8: dq 0x42f0e1eba9ea3693
+rk9: dq 0xe464f4df5fb60ac1
+rk10: dq 0xb649c5b35a759cf2
+rk11: dq 0x9af04e1eff82d0dd
+rk12: dq 0x6e82e609297f8fe8
+rk13: dq 0x097c516e98bd2e73
+rk14: dq 0x0b76477b31e22e7b
+rk15: dq 0x5f6843ca540df020
+rk16: dq 0xddf4b6981205b83f
+rk17: dq 0x54819d8713758b2c
+rk18: dq 0x4a6b90073eb0af5a
+rk19: dq 0x571bee0a227ef92b
+rk20: dq 0x44bef2a201b5200c
+rk_1b: dq 0x05f5c3c7eb52fab6
+rk_2b: dq 0x4eb938a7d257740e
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%endm
+
+%include "crc64_iso_norm_by16_10.asm"
diff --git a/src/isa-l/crc/crc64_ecma_norm_by8.asm b/src/isa-l/crc/crc64_ecma_norm_by8.asm
new file mode 100644
index 000000000..ca99e344a
--- /dev/null
+++ b/src/isa-l/crc/crc64_ecma_norm_by8.asm
@@ -0,0 +1,584 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; uint64_t crc64_ecma_norm_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+; sample yasm command line:
+; yasm -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8.asm
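+;
+; Minimal C usage sketch (names are illustrative; the prototypes come from
+; isa-l's crc64.h, also used by crc64_funcs_test.c later in this patch):
+;
+;     uint64_t seed = 0;                                    // initial CRC value
+;     uint64_t crc  = crc64_ecma_norm(seed, buf, len);      // multibinary dispatcher
+;     uint64_t raw  = crc64_ecma_norm_by8(seed, buf, len);  // this routine directly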
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+align 16
+mk_global crc64_ecma_norm_by8, function
+crc64_ecma_norm_by8:
+ endbranch
+
+ not arg1 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it does need to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then line up with the initial crc in the correct place.
+ pslldq xmm10, 8
+
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point in the code there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+	; will fold 128B at a time until only 128+y bytes of buffer remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
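+	;
+	; Each 128-bit accumulator R is advanced 128 bytes per pass using the carry-less
+	; multiply folding identity (a sketch of the math, not extra code):
+	;     R' = clmul(R_lo, rk3) xor clmul(R_hi, rk4) xor D
+	; where D is the next (byte-reflected) 16-byte block for that lane and rk3:rk4 is
+	; the constant pair loaded into xmm10 above; the pclmulqdq imm8 (0x00 / 0x11)
+	; selects which 64-bit halves are multiplied. Folding this way keeps each lane
+	; congruent to its part of the message modulo the CRC polynomial.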
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+	; since we know that there was data before the pointer, we can offset the input pointer backwards to load exactly 16 bytes.
+ ; after that the registers need to be adjusted.
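+	; (the pshufb_shf_table defined near the end of this file supplies the byte-shuffle
+	; masks used for these partial-block shifts; see its comment for how the entries
+	; map to shift amounts)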
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x01 ; H*L
+ pslldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
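+	; Barrett reduction of the folded 128-bit value down to a 64-bit CRC.
+	; Roughly: rk7 and rk8 hold the precomputed Barrett constants (a quotient
+	; approximation and the polynomial), so the two carry-less multiplies below
+	; estimate the quotient and subtract quotient*P, leaving the remainder (the
+	; CRC) in the low 64 bits extracted into rax.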
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7
+
+ movdqa xmm1, xmm7
+ pand xmm1, [mask3]
+ pclmulqdq xmm7, xmm10, 0x01
+ pxor xmm7, xmm1
+
+ pclmulqdq xmm7, xmm10, 0x11
+ pxor xmm7, xmm0
+ pextrq rax, xmm7, 0
+
+_cleanup:
+ not rax
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Right shift (8-length) bytes in XMM
+ add rax, 8
+ movdqu xmm0, [rax]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+
+rk1 :
+DQ 0x5f5c3c7eb52fab6
+rk2 :
+DQ 0x4eb938a7d257740e
+rk3 :
+DQ 0x5cf79dea9ac37d6
+rk4 :
+DQ 0x001067e571d7d5c2
+rk5 :
+DQ 0x5f5c3c7eb52fab6
+rk6 :
+DQ 0x0000000000000000
+rk7 :
+DQ 0x578d29d06cc4f872
+rk8 :
+DQ 0x42f0e1eba9ea3693
+rk9 :
+DQ 0xe464f4df5fb60ac1
+rk10 :
+DQ 0xb649c5b35a759cf2
+rk11 :
+DQ 0x9af04e1eff82d0dd
+rk12 :
+DQ 0x6e82e609297f8fe8
+rk13 :
+DQ 0x97c516e98bd2e73
+rk14 :
+DQ 0xb76477b31e22e7b
+rk15 :
+DQ 0x5f6843ca540df020
+rk16 :
+DQ 0xddf4b6981205b83f
+rk17 :
+DQ 0x54819d8713758b2c
+rk18 :
+DQ 0x4a6b90073eb0af5a
+rk19 :
+DQ 0x571bee0a227ef92b
+rk20 :
+DQ 0x44bef2a201b5200c
+
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+mask3:
+dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_ecma_norm_by8, 01, 00, 001a
diff --git a/src/isa-l/crc/crc64_ecma_refl_by16_10.asm b/src/isa-l/crc/crc64_ecma_refl_by16_10.asm
new file mode 100644
index 000000000..a48d0b203
--- /dev/null
+++ b/src/isa-l/crc/crc64_ecma_refl_by16_10.asm
@@ -0,0 +1,61 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
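+; Same layering as crc64_ecma_norm_by16_10.asm: only the ECMA reflected fold
+; constants live here; the shared AVX-512 body comes from crc64_iso_refl_by16_10.asm
+; via the %include at the bottom.
+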
+%define FUNCTION_NAME crc64_ecma_refl_by16_10
+%define USE_CONSTS
+%macro INCLUDE_CONSTS 0
+rk_1: dq 0xf31fd9271e228b79
+rk_2: dq 0x8260adf2381ad81c
+rk1: dq 0xdabe95afc7875f40
+rk2: dq 0xe05dd497ca393ae4
+rk3: dq 0xd7d86b2af73de740
+rk4: dq 0x8757d71d4fcc1000
+rk5: dq 0xdabe95afc7875f40
+rk6: dq 0x0000000000000000
+rk7: dq 0x9c3e466c172963d5
+rk8: dq 0x92d8af2baf0e1e84
+rk9: dq 0x947874de595052cb
+rk10: dq 0x9e735cb59b4724da
+rk11: dq 0xe4ce2cd55fea0037
+rk12: dq 0x2fe3fd2920ce82ec
+rk13: dq 0x0e31d519421a63a5
+rk14: dq 0x2e30203212cac325
+rk15: dq 0x081f6054a7842df4
+rk16: dq 0x6ae3efbb9dd441f3
+rk17: dq 0x69a35d91c3730254
+rk18: dq 0xb5ea1af9c013aca4
+rk19: dq 0x3be653a30fe1af51
+rk20: dq 0x60095b008a9efa44
+rk_1b: dq 0xdabe95afc7875f40
+rk_2b: dq 0xe05dd497ca393ae4
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%endm
+
+%include "crc64_iso_refl_by16_10.asm"
diff --git a/src/isa-l/crc/crc64_ecma_refl_by8.asm b/src/isa-l/crc/crc64_ecma_refl_by8.asm
new file mode 100644
index 000000000..c09ddfa4f
--- /dev/null
+++ b/src/isa-l/crc/crc64_ecma_refl_by8.asm
@@ -0,0 +1,549 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_ecma_refl_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+; sample yasm command line:
+; yasm -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8.asm
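+;
+; Minimal C usage sketch (names are illustrative; crc64_ecma_refl is the
+; multibinary dispatcher used by crc64_example.c later in this patch):
+;
+;     uint64_t crc = crc64_ecma_refl(0, buf, len);       // dispatcher
+;     uint64_t raw = crc64_ecma_refl_by8(0, buf, len);   // this routine directly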
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+
+align 16
+mk_global crc64_ecma_refl_by8, function
+crc64_ecma_refl_by8:
+ endbranch
+	; uint64_t c = crc ^ 0xffffffffffffffffL;
+ not arg1
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point in the code there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+	; will fold 128B at a time until only 128+y bytes of buffer remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm1, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm3, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm5, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm7, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+ ; xmm0 to xmm7
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+ ;xmm1 to xmm7
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+ ; xmm6 to xmm7
+ movdqa xmm10, [rk1]
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+ ; here we are getting data that is less than 16 bytes.
+	; since we know that there was data before the pointer, we can offset the input pointer backwards to load exactly 16 bytes.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ movdqa xmm2, xmm7
+ movdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ movdqu xmm0, [rax]
+
+
+ pshufb xmm7, xmm0
+ pxor xmm0, [mask3]
+ pshufb xmm2, xmm0
+
+ pblendvb xmm2, xmm1 ;xmm0 is implicit
+ ;;;;;;;;;;
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5]
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0
+ psrldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm1, xmm7
+ movdqa xmm10, [rk7]
+
+ pclmulqdq xmm7, xmm10, 0
+ movdqa xmm2, xmm7
+ pclmulqdq xmm7, xmm10, 0x10
+ pslldq xmm2, 8
+ pxor xmm7, xmm2
+ pxor xmm7, xmm1
+ pextrq rax, xmm7, 1
+
+_cleanup:
+	; return c ^ 0xffffffffffffffffL;
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movq xmm0, arg1 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax + r9]
+ pshufb xmm7,xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Left shift (8-length) bytes in XMM
+ movdqu xmm0, [rax + r9 + 8]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+; rk7 = floor(2^128/Q)
+; rk8 = Q
+rk1 :
+DQ 0xdabe95afc7875f40
+rk2 :
+DQ 0xe05dd497ca393ae4
+rk3 :
+DQ 0xd7d86b2af73de740
+rk4 :
+DQ 0x8757d71d4fcc1000
+rk5 :
+DQ 0xdabe95afc7875f40
+rk6 :
+DQ 0x0000000000000000
+rk7 :
+DQ 0x9c3e466c172963d5
+rk8 :
+DQ 0x92d8af2baf0e1e84
+rk9 :
+DQ 0x947874de595052cb
+rk10 :
+DQ 0x9e735cb59b4724da
+rk11 :
+DQ 0xe4ce2cd55fea0037
+rk12 :
+DQ 0x2fe3fd2920ce82ec
+rk13 :
+DQ 0xe31d519421a63a5
+rk14 :
+DQ 0x2e30203212cac325
+rk15 :
+DQ 0x81f6054a7842df4
+rk16 :
+DQ 0x6ae3efbb9dd441f3
+rk17 :
+DQ 0x69a35d91c3730254
+rk18 :
+DQ 0xb5ea1af9c013aca4
+rk19 :
+DQ 0x3be653a30fe1af51
+rk20 :
+DQ 0x60095b008a9efa44
+
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3:
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_ecma_refl_by8, 01, 00, 001d
diff --git a/src/isa-l/crc/crc64_example.c b/src/isa-l/crc/crc64_example.c
new file mode 100644
index 000000000..64763a1b0
--- /dev/null
+++ b/src/isa-l/crc/crc64_example.c
@@ -0,0 +1,68 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "crc64.h"
+
+#define BUF_SIZE 8192
+#define INIT_SEED 0x12345678
+
+int main(int argc, char *argv[])
+{
+ uint8_t inbuf[BUF_SIZE];
+ uint64_t avail_in, total_in = 0;
+ uint64_t crc64_checksum;
+ FILE *in;
+
+ if (argc != 2) {
+ fprintf(stderr, "Usage: crc64_example infile\n");
+		exit(1);
+ }
+ in = fopen(argv[1], "rb");
+ if (!in) {
+ fprintf(stderr, "Can't open %s for reading\n", argv[1]);
+		exit(1);
+ }
+
+ printf("crc64_example -- crc64_ecma_refl:\n");
+ fflush(0);
+
+ crc64_checksum = INIT_SEED;
+ while ((avail_in = fread(inbuf, 1, BUF_SIZE, in))) {
+ // crc update mode
+ crc64_checksum = crc64_ecma_refl(crc64_checksum, inbuf, avail_in);
+ total_in += avail_in;
+ }
+
+ fclose(in);
+ printf("total length is %ld, checksum is 0x%lx\n", total_in, crc64_checksum);
+
+ return 0;
+}
diff --git a/src/isa-l/crc/crc64_funcs_perf.c b/src/isa-l/crc/crc64_funcs_perf.c
new file mode 100644
index 000000000..4ad1cc199
--- /dev/null
+++ b/src/isa-l/crc/crc64_funcs_perf.c
@@ -0,0 +1,103 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include "crc64.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define TEST_MEM TEST_LEN
+
+typedef uint64_t(*crc64_func_t) (uint64_t, const uint8_t *, uint64_t);
+
+typedef struct func_case {
+ char *note;
+ crc64_func_t crc64_func_call;
+ crc64_func_t crc64_ref_call;
+} func_case_t;
+
+func_case_t test_funcs[] = {
+ {"crc64_ecma_norm", crc64_ecma_norm, crc64_ecma_norm_base},
+ {"crc64_ecma_refl", crc64_ecma_refl, crc64_ecma_refl_base},
+ {"crc64_iso_norm", crc64_iso_norm, crc64_iso_norm_base},
+ {"crc64_iso_refl", crc64_iso_refl, crc64_iso_refl_base},
+ {"crc64_jones_norm", crc64_jones_norm, crc64_jones_norm_base},
+ {"crc64_jones_refl", crc64_jones_refl, crc64_jones_refl_base}
+};
+
+int main(int argc, char *argv[])
+{
+ int j;
+ void *buf;
+ uint64_t crc;
+ struct perf start;
+ func_case_t *test_func;
+
+ if (posix_memalign(&buf, 1024, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ memset(buf, (char)TEST_SEED, TEST_LEN);
+
+ for (j = 0; j < sizeof(test_funcs) / sizeof(test_funcs[0]); j++) {
+ test_func = &test_funcs[j];
+ printf("%s_perf:\n", test_func->note);
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ BENCHMARK(&start, BENCHMARK_TIME, crc =
+ test_func->crc64_func_call(TEST_SEED, buf, TEST_LEN));
+ printf("%s" TEST_TYPE_STR ": ", test_func->note);
+ perf_print(start, (long long)TEST_LEN);
+
+ printf("finish 0x%lx\n", crc);
+ }
+
+ return 0;
+}
diff --git a/src/isa-l/crc/crc64_funcs_test.c b/src/isa-l/crc/crc64_funcs_test.c
new file mode 100644
index 000000000..7e4ee2b37
--- /dev/null
+++ b/src/isa-l/crc/crc64_funcs_test.c
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "crc64.h"
+#include "types.h"
+#include "crc64_ref.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define MAX_BUF 4096
+#define TEST_SIZE 32
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+typedef uint64_t(*crc64_func_t) (uint64_t, const uint8_t *, uint64_t);
+
+typedef struct func_case {
+ char *note;
+ crc64_func_t crc64_func_call;
+ crc64_func_t crc64_base_call;
+ crc64_func_t crc64_ref_call;
+} func_case_t;
+
+func_case_t test_funcs[] = {
+ {"crc64_ecma_norm", crc64_ecma_norm, crc64_ecma_norm_base, crc64_ecma_norm_ref},
+ {"crc64_ecma_refl", crc64_ecma_refl, crc64_ecma_refl_base, crc64_ecma_refl_ref},
+ {"crc64_iso_norm", crc64_iso_norm, crc64_iso_norm_base, crc64_iso_norm_ref},
+ {"crc64_iso_refl", crc64_iso_refl, crc64_iso_refl_base, crc64_iso_refl_ref},
+ {"crc64_jones_norm", crc64_jones_norm, crc64_jones_norm_base,
+ crc64_jones_norm_ref},
+ {"crc64_jones_refl", crc64_jones_refl, crc64_jones_refl_base, crc64_jones_refl_ref}
+};
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+// Test cases
+int zeros_test(func_case_t * test_func);
+
+int simple_pattern_test(func_case_t * test_func);
+
+int seeds_sizes_test(func_case_t * test_func);
+
+int eob_test(func_case_t * test_func);
+
+int update_test(func_case_t * test_func);
+
+int verbose = 0;
+void *buf_alloc = NULL;
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, fail_case;
+ int i, ret;
+ func_case_t *test_func;
+
+ verbose = argc - 1;
+
+ // Align to 32B boundary
+ ret = posix_memalign(&buf_alloc, TEST_SIZE, MAX_BUF * TEST_SIZE);
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ srand(TEST_SEED);
+ printf("CRC64 Tests\n");
+
+ for (i = 0; i < sizeof(test_funcs) / sizeof(test_funcs[0]); i++) {
+ fail_case = 0;
+ test_func = &test_funcs[i];
+
+ printf("Test %s\t", test_func->note);
+ fail_case += zeros_test(test_func);
+ fail_case += simple_pattern_test(test_func);
+ fail_case += seeds_sizes_test(test_func);
+ fail_case += eob_test(test_func);
+ fail_case += update_test(test_func);
+ printf(" done: %s\n", fail_case ? "Fail" : "Pass");
+
+ if (fail_case) {
+ printf("\n%s Failed %d tests\n", test_func->note, fail_case);
+ fail++;
+ }
+ }
+
+ printf("CRC64 Tests all done: %s\n", fail ? "Fail" : "Pass");
+
+ return fail;
+}
+
+// Test of all zeros
+int zeros_test(func_case_t * test_func)
+{
+ uint64_t crc_ref, crc_base, crc;
+ int fail = 0;
+ unsigned char *buf = NULL;
+
+ buf = (unsigned char *)buf_alloc;
+ memset(buf, 0, MAX_BUF * 10);
+ crc_ref = test_func->crc64_ref_call(TEST_SEED, buf, MAX_BUF * 10);
+ crc_base = test_func->crc64_base_call(TEST_SEED, buf, MAX_BUF * 10);
+ crc = test_func->crc64_func_call(TEST_SEED, buf, MAX_BUF * 10);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref)) {
+ fail++;
+		printf("\n		ref		base		opt\n");
+		printf("		------		------		------\n");
+ printf("crc zero = 0x%16lx 0x%16lx 0x%16lx \n", crc_ref, crc_base, crc);
+ } else
+ printf(".");
+
+ return fail;
+}
+
+// Another simple test pattern
+int simple_pattern_test(func_case_t * test_func)
+{
+ uint64_t crc_ref, crc_base, crc;
+ int fail = 0;
+ unsigned char *buf = NULL;
+
+ buf = (unsigned char *)buf_alloc;
+ memset(buf, 0x8a, MAX_BUF);
+ crc_ref = test_func->crc64_ref_call(TEST_SEED, buf, MAX_BUF);
+ crc_base = test_func->crc64_base_call(TEST_SEED, buf, MAX_BUF);
+ crc = test_func->crc64_func_call(TEST_SEED, buf, MAX_BUF);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc all 8a = 0x%16lx 0x%16lx 0x%16lx\n", crc_ref, crc_base, crc);
+ else
+ printf(".");
+
+ return fail;
+}
+
+int seeds_sizes_test(func_case_t * test_func)
+{
+ uint64_t crc_ref, crc_base, crc;
+ int fail = 0;
+ int i;
+ uint64_t r, s;
+ unsigned char *buf = NULL;
+
+ // Do a few random tests
+ buf = (unsigned char *)buf_alloc; //reset buf
+ r = rand();
+ rand_buffer(buf, MAX_BUF * TEST_SIZE);
+
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF);
+ crc_base = test_func->crc64_base_call(r, buf, MAX_BUF);
+ crc = test_func->crc64_func_call(r, buf, MAX_BUF);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%16lx 0x%16lx 0x%16lx\n", i, crc_ref, crc_base,
+ crc);
+ else if (i % (TEST_SIZE / 8) == 0)
+ printf(".");
+ buf += MAX_BUF;
+ }
+
+ // Do a few random sizes
+ buf = (unsigned char *)buf_alloc; //reset buf
+ r = rand();
+
+ for (i = MAX_BUF; i >= 0; i--) {
+ crc_ref = test_func->crc64_ref_call(r, buf, i);
+ crc_base = test_func->crc64_base_call(r, buf, i);
+ crc = test_func->crc64_func_call(r, buf, i);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref)) {
+ fail++;
+ printf("fail random size%i 0x%16lx 0x%16lx 0x%16lx\n", i, crc_ref,
+ crc_base, crc);
+ } else if (i % (MAX_BUF / 8) == 0)
+ printf(".");
+ }
+
+ // Try different seeds
+ for (s = 0; s < 20; s++) {
+ buf = (unsigned char *)buf_alloc; //reset buf
+
+ r = rand(); // just to get a new seed
+ rand_buffer(buf, MAX_BUF * TEST_SIZE); // new pseudo-rand data
+
+ if (verbose)
+ printf("seed = 0x%lx\n", r);
+
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF);
+ crc_base = test_func->crc64_base_call(r, buf, MAX_BUF);
+ crc = test_func->crc64_func_call(r, buf, MAX_BUF);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%16lx 0x%16lx 0x%16lx\n", i, crc_ref,
+ crc_base, crc);
+ else if (i % (TEST_SIZE * 20 / 8) == 0)
+ printf(".");
+ buf += MAX_BUF;
+ }
+ }
+
+ return fail;
+}
+
+// Run tests at end of buffer
+int eob_test(func_case_t * test_func)
+{
+ uint64_t crc_ref, crc_base, crc;
+ int fail = 0;
+ int i;
+ unsigned char *buf = NULL;
+
+ // Null test
+ if (0 != test_func->crc64_func_call(0, NULL, 0)) {
+ fail++;
+ printf("crc null test fail\n");
+ }
+
+ buf = (unsigned char *)buf_alloc; //reset buf
+ buf = buf + ((MAX_BUF - 1) * TEST_SIZE); //Line up TEST_SIZE from end
+ for (i = 0; i <= TEST_SIZE; i++) {
+ crc_ref = test_func->crc64_ref_call(TEST_SEED, buf + i, TEST_SIZE - i);
+ crc_base = test_func->crc64_base_call(TEST_SEED, buf + i, TEST_SIZE - i);
+ crc = test_func->crc64_func_call(TEST_SEED, buf + i, TEST_SIZE - i);
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc eob rand%3d = 0x%16lx 0x%16lx 0x%16lx\n", i, crc_ref,
+ crc_base, crc);
+ else if (i % (TEST_SIZE / 8) == 0)
+ printf(".");
+ }
+
+ return fail;
+}
+
+int update_test(func_case_t * test_func)
+{
+ uint64_t crc_ref, crc_base, crc;
+ int fail = 0;
+ int i;
+ uint64_t r;
+ unsigned char *buf = NULL;
+
+ buf = (unsigned char *)buf_alloc; //reset buf
+ r = rand();
+	// Process the whole buffer in a single call to the reference and base functions.
+ crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF * TEST_SIZE);
+ crc_base = test_func->crc64_base_call(r, buf, MAX_BUF * TEST_SIZE);
+ // Process buf with update method.
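+	// This relies on the update property of these functions: because each call
+	// takes the previous CRC as its seed, crc(seed, A||B) == crc(crc(seed, A), B),
+	// so feeding the running CRC back in chunk by chunk must match the single-call
+	// result computed above.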
+ for (i = 0; i < TEST_SIZE; i++) {
+ crc = test_func->crc64_func_call(r, buf, MAX_BUF);
+ // Update crc seeds and buf pointer.
+ r = crc;
+ buf += MAX_BUF;
+ }
+
+ if ((crc_base != crc_ref) || (crc != crc_ref))
+ fail++;
+ if (verbose)
+ printf("crc rand%3d = 0x%16lx 0x%16lx 0x%16lx\n", i, crc_ref, crc_base, crc);
+ else
+ printf(".");
+
+ return fail;
+}
diff --git a/src/isa-l/crc/crc64_iso_norm_by16_10.asm b/src/isa-l/crc/crc64_iso_norm_by16_10.asm
new file mode 100644
index 000000000..4eefbd35e
--- /dev/null
+++ b/src/isa-l/crc/crc64_iso_norm_by16_10.asm
@@ -0,0 +1,525 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_iso_norm_by16_10(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
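+; When FUNCTION_NAME is not overridden this builds the ISO (normal) variant; the
+; ECMA by16 wrapper earlier in this patch re-uses this body by defining
+; FUNCTION_NAME and INCLUDE_CONSTS before %including this file. The 512-bit code
+; below is gated on AS_FEATURE_LEVEL >= 10, i.e. builds where the assembler
+; supports the AVX-512 / VPCLMULQDQ encodings it uses.
+;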
+%include "reg_sizes.asm"
+
+%ifndef FUNCTION_NAME
+%define FUNCTION_NAME crc64_iso_norm_by16_10
+%endif
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*12+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global FUNCTION_NAME, function
+FUNCTION_NAME:
+ endbranch
+ not arg1
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+%endif
+ vbroadcasti32x4 zmm18, [SHUF_MASK]
+ cmp arg3, 256
+ jl _less_than_256
+
+ ; load the initial crc value
+ vmovq xmm10, arg1 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it does need to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then line up with the initial crc in the correct place.
+ vpslldq xmm10, 8
+
+ ; receive the initial 128B data, xor the initial crc value
+ vmovdqu8 zmm0, [arg2+16*0]
+ vmovdqu8 zmm4, [arg2+16*4]
+ vpshufb zmm0, zmm0, zmm18
+ vpshufb zmm4, zmm4, zmm18
+ vpxorq zmm0, zmm10
+ vbroadcasti32x4 zmm10, [rk3] ;zmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ sub arg3, 256
+ cmp arg3, 256
+ jl _fold_128_B_loop
+
+ vmovdqu8 zmm7, [arg2+16*8]
+ vmovdqu8 zmm8, [arg2+16*12]
+ vpshufb zmm7, zmm7, zmm18
+ vpshufb zmm8, zmm8, zmm18
+ vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
+ sub arg3, 256
+
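+	; With at least another 256 bytes available, four zmm accumulators
+	; (zmm0, zmm4, zmm7, zmm8) are folded 256 bytes per iteration using the
+	; rk_1:rk_2 constant pair broadcast into zmm16; afterwards they are folded
+	; back down to two accumulators (zmm0, zmm4) for the final reduction.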
+_fold_256_B_loop:
+ add arg2, 256
+ vmovdqu8 zmm3, [arg2+16*0]
+ vpshufb zmm3, zmm3, zmm18
+ vpclmulqdq zmm1, zmm0, zmm16, 0x00
+ vpclmulqdq zmm2, zmm0, zmm16, 0x11
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm3
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpshufb zmm9, zmm9, zmm18
+ vpclmulqdq zmm5, zmm4, zmm16, 0x00
+ vpclmulqdq zmm6, zmm4, zmm16, 0x11
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ vmovdqu8 zmm11, [arg2+16*8]
+ vpshufb zmm11, zmm11, zmm18
+ vpclmulqdq zmm12, zmm7, zmm16, 0x00
+ vpclmulqdq zmm13, zmm7, zmm16, 0x11
+ vpxorq zmm7, zmm12, zmm13
+ vpxorq zmm7, zmm7, zmm11
+
+ vmovdqu8 zmm17, [arg2+16*12]
+ vpshufb zmm17, zmm17, zmm18
+ vpclmulqdq zmm14, zmm8, zmm16, 0x00
+ vpclmulqdq zmm15, zmm8, zmm16, 0x11
+ vpxorq zmm8, zmm14, zmm15
+ vpxorq zmm8, zmm8, zmm17
+
+ sub arg3, 256
+ jge _fold_256_B_loop
+
+ ;; Fold 256 into 128
+ add arg2, 256
+ vpclmulqdq zmm1, zmm0, zmm10, 0x00
+ vpclmulqdq zmm2, zmm0, zmm10, 0x11
+ vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
+
+ vpclmulqdq zmm5, zmm4, zmm10, 0x00
+ vpclmulqdq zmm6, zmm4, zmm10, 0x11
+ vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
+
+ vmovdqa32 zmm0, zmm7
+ vmovdqa32 zmm4, zmm8
+
+ add arg3, 128
+ jmp _fold_128_B_register
+
+ ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
+_fold_128_B_loop:
+ add arg2, 128 ; update the buffer pointer
+ vmovdqu8 zmm8, [arg2+16*0]
+ vpshufb zmm8, zmm8, zmm18
+ vpclmulqdq zmm1, zmm0, zmm10, 0x00
+ vpclmulqdq zmm2, zmm0, zmm10, 0x11
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm8
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpshufb zmm9, zmm9, zmm18
+ vpclmulqdq zmm5, zmm4, zmm10, 0x00
+ vpclmulqdq zmm6, zmm4, zmm10, 0x11
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+ sub arg3, 128
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4
+
+_fold_128_B_register:
+ ; fold the 8 128b parts into 1 xmm register with different constants
+ vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
+ vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
+ vpclmulqdq zmm1, zmm0, zmm16, 0x00
+ vpclmulqdq zmm2, zmm0, zmm16, 0x11
+ vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
+
+ vpclmulqdq zmm5, zmm4, zmm11, 0x00
+ vpclmulqdq zmm6, zmm4, zmm11, 0x11
+ vmovdqa xmm10, [rk1] ; Needed later in reduction loop
+ vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
+ vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
+
+ vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
+ vpxorq ymm8, ymm8, ymm1
+ vextracti64x2 xmm5, ymm8, 1
+ vpxorq xmm7, xmm5, xmm8
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x00
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpshufb xmm0, xmm0, xmm18
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+ ; here we are getting data that is less than 16 bytes.
+	; since we know that there was data before the pointer, we can back the
+	; input pointer up so that exactly 16 bytes are loaded.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+ vmovdqa xmm2, xmm7
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+ vpshufb xmm1, xmm18
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ vmovdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ vpshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ vpxor xmm0, [mask1]
+ vpshufb xmm7, xmm0
+ vpblendvb xmm1, xmm1, xmm2, xmm0
+
+ ; fold 16 Bytes
+ vmovdqa xmm2, xmm1
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x11
+ vpclmulqdq xmm8, xmm10, 0x0
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5]
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0x01 ; H*L
+ vpslldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ vmovdqa xmm0, xmm7
+
+ vmovdqa xmm1, xmm7
+ vpand xmm1, [mask3]
+ vpclmulqdq xmm7, xmm10, 0x01
+ vpxor xmm7, xmm1
+
+ vpclmulqdq xmm7, xmm10, 0x11
+ vpxor xmm7, xmm0
+ vpextrq rax, xmm7, 0
+
+_cleanup:
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovq xmm0, arg1 ; get the initial crc value
+ vpslldq xmm0, 8 ; align it to its correct place
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm18 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ vmovq xmm0, arg1 ; get the initial crc value
+ vpslldq xmm0, 8 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpshufb xmm7, xmm18 ; byte-reflect the plaintext
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
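The _less_than_16_left path above stages the final 1 to 15 bytes through a zeroed 16-byte stack slot in descending 8/4/2/1-byte pieces, so that _zero_left can finish with a single aligned 16-byte load. A scalar C sketch of the same staging, under the assumption that 1 <= len <= 15 (stage_tail is an illustrative name, not library code):

    #include <string.h>

    /* Copy the 1..15 trailing bytes into a zeroed 16-byte scratch buffer. */
    static void stage_tail(unsigned char dst[16], const unsigned char *src, size_t len)
    {
            size_t off = 0;
            memset(dst, 0, 16);
            if (len & 8) { memcpy(dst + off, src + off, 8); off += 8; }
            if (len & 4) { memcpy(dst + off, src + off, 4); off += 4; }
            if (len & 2) { memcpy(dst + off, src + off, 2); off += 2; }
            if (len & 1) { dst[off] = src[off]; }
    }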
+_zero_left:
+ vmovdqa xmm7, [rsp]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ vmovdqu xmm0, [rax]
+ vpxor xmm0, [mask1]
+
+ vpshufb xmm7, xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Right shift (8-length) bytes in XMM
+ add rax, 8
+ vmovdqu xmm0, [rax]
+ vpshufb xmm7,xmm0
+
+ jmp _barrett
+
+align 16
+_exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpshufb xmm7, xmm18
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+align 32
+
+%ifndef USE_CONSTS
+; precomputed constants
+rk_1: dq 0x0000001a00000144
+rk_2: dq 0x0000015e00001dac
+rk1: dq 0x0000000000000145
+rk2: dq 0x0000000000001db7
+rk3: dq 0x000100000001001a
+rk4: dq 0x001b0000001b015e
+rk5: dq 0x0000000000000145
+rk6: dq 0x0000000000000000
+rk7: dq 0x000000000000001b
+rk8: dq 0x000000000000001b
+rk9: dq 0x0150145145145015
+rk10: dq 0x1c71db6db6db71c7
+rk11: dq 0x0001110110110111
+rk12: dq 0x001aab1ab1ab1aab
+rk13: dq 0x0000014445014445
+rk14: dq 0x00001daab71daab7
+rk15: dq 0x0000000101000101
+rk16: dq 0x0000001b1b001b1b
+rk17: dq 0x0000000001514515
+rk18: dq 0x000000001c6db6c7
+rk19: dq 0x0000000000011011
+rk20: dq 0x00000000001ab1ab
+
+rk_1b: dq 0x0000000000000145
+rk_2b: dq 0x0000000000001db7
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%else
+INCLUDE_CONSTS
+%endif
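The default rk values above are remainders of x^n modulo the CRC polynomial; rk1 and rk2, for instance, are the constants used to fold the 128-bit accumulator across a 16-byte distance. The small stand-alone program below recomputes them bit by bit for the ISO polynomial 0x1B and prints 0x145 and 0x1db7, matching rk1 and rk2 above (a checking sketch only, not library code; xn_mod_p is an illustrative name):

    #include <stdint.h>
    #include <stdio.h>

    /* x^n mod P over GF(2); poly is P with its implicit x^64 term dropped. */
    static uint64_t xn_mod_p(unsigned n, uint64_t poly)
    {
            uint64_t r = 1;          /* start from x^0 */
            while (n--)
                    r = (r & 0x8000000000000000ULL) ? (r << 1) ^ poly : r << 1;
            return r;
    }

    int main(void)
    {
            printf("rk1 = 0x%llx\n", (unsigned long long)xn_mod_p(128, 0x1B)); /* 0x145  */
            printf("rk2 = 0x%llx\n", (unsigned long long)xn_mod_p(192, 0x1B)); /* 0x1db7 */
            return 0;
    }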
+
+mask1: dq 0x8080808080808080, 0x8080808080808080
+mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+mask3: dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
+
+SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
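The table above works because pshufb writes a zero for any control byte whose top bit is set and otherwise copies the source byte it indexes, so one table lookup turns pshufb into a variable byte shift. A C intrinsics sketch of the same trick, assuming SSSE3 (-mssse3); byte_shift_right is an illustrative helper that builds the control bytes directly instead of reading this table:

    #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */
    #include <string.h>

    /* Shift the 16 bytes of v right by n byte positions (1 <= n <= 15),
     * filling with zeroes, using pshufb's "0x80 selects zero" rule. */
    static __m128i byte_shift_right(__m128i v, int n)
    {
            unsigned char ctl[16];
            __m128i c;
            for (int i = 0; i < 16; i++)
                    ctl[i] = (i + n < 16) ? (unsigned char)(i + n) : 0x80;
            memcpy(&c, ctl, sizeof(c));
            return _mm_shuffle_epi8(v, c);
    }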
+
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FUNCTION_NAME
+no_ %+ FUNCTION_NAME %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/isa-l/crc/crc64_iso_norm_by8.asm b/src/isa-l/crc/crc64_iso_norm_by8.asm
new file mode 100644
index 000000000..16147d5ff
--- /dev/null
+++ b/src/isa-l/crc/crc64_iso_norm_by8.asm
@@ -0,0 +1,582 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; uint64_t crc64_iso_norm_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
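A usage sketch for the routine declared above: the prototype is copied from the comment, and everything else (buffer, chunk sizes, seed of 0) is illustrative. Because the seed parameter is the previous CRC value, a buffer can be processed in pieces by passing each call the result of the previous one, and the chunked result should match a single call over the whole buffer.

    #include <stdint.h>

    uint64_t crc64_iso_norm_by8(uint64_t init_crc, const unsigned char *buf, uint64_t len);

    /* Illustrative caller: checksum a buffer in two pieces. */
    uint64_t checksum_two_pieces(const unsigned char *data, uint64_t len)
    {
            uint64_t half = len / 2;
            uint64_t crc = 0;                                       /* initial seed */
            crc = crc64_iso_norm_by8(crc, data, half);              /* first half   */
            crc = crc64_iso_norm_by8(crc, data + half, len - half); /* remainder    */
            return crc;
    }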
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+align 16
+mk_global crc64_iso_norm_by8, function
+crc64_iso_norm_by8:
+ endbranch
+
+ not arg1 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it does need to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then line up with the initial crc in the correct place.
+ pslldq xmm10, 8
+
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point in the code, there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+	; loop will fold 128B at a time until only 128+y bytes of buffer remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
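The loop below, like the later _16B_reduction_loop, is built from a single carry-less fold step: multiply the two 64-bit halves of a 128-bit accumulator by a pair of precomputed constants and XOR in the next block of input. A C intrinsics sketch of that step, assuming PCLMULQDQ and SSE2 (e.g. -mpclmul -msse2); the pshufb byte reversal the assembly applies to the loaded data is omitted, and fold_16 is an illustrative name:

    #include <wmmintrin.h>   /* PCLMULQDQ */
    #include <emmintrin.h>   /* SSE2 */

    /* One fold step: acc.lo * rk.lo  ^  acc.hi * rk.hi  ^  next 16 bytes.
     * rk holds the constant pair: rk3/rk4 for the 128-byte distance of
     * this loop, rk1/rk2 for the 16-byte distance of the reduction loop. */
    static inline __m128i fold_16(__m128i acc, __m128i rk, __m128i next)
    {
            __m128i lo = _mm_clmulepi64_si128(acc, rk, 0x00); /* acc.lo * rk.lo */
            __m128i hi = _mm_clmulepi64_si128(acc, rk, 0x11); /* acc.hi * rk.hi */
            return _mm_xor_si128(_mm_xor_si128(lo, hi), next);
    }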
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+ ; here we are getting data that is less than 16 bytes.
+	; since we know that there was data before the pointer, we can back the input pointer up so that exactly 16 bytes are loaded.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x01 ; H*L
+ pslldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7
+
+ movdqa xmm1, xmm7
+ pand xmm1, [mask3]
+ pclmulqdq xmm7, xmm10, 0x01
+ pxor xmm7, xmm1
+
+ pclmulqdq xmm7, xmm10, 0x11
+ pxor xmm7, xmm0
+ pextrq rax, xmm7, 0
+
+_cleanup:
+ not rax
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Right shift (8-length) bytes in XMM
+ add rax, 8
+ movdqu xmm0, [rax]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+
+rk1:
+DQ 0x0000000000000145
+rk2:
+DQ 0x0000000000001db7
+rk3:
+DQ 0x000100000001001a
+rk4:
+DQ 0x001b0000001b015e
+rk5:
+DQ 0x0000000000000145
+rk6:
+DQ 0x0000000000000000
+rk7:
+DQ 0x000000000000001b
+rk8:
+DQ 0x000000000000001b
+rk9:
+DQ 0x0150145145145015
+rk10:
+DQ 0x1c71db6db6db71c7
+rk11:
+DQ 0x0001110110110111
+rk12:
+DQ 0x001aab1ab1ab1aab
+rk13:
+DQ 0x0000014445014445
+rk14:
+DQ 0x00001daab71daab7
+rk15:
+DQ 0x0000000101000101
+rk16:
+DQ 0x0000001b1b001b1b
+rk17:
+DQ 0x0000000001514515
+rk18:
+DQ 0x000000001c6db6c7
+rk19:
+DQ 0x0000000000011011
+rk20:
+DQ 0x00000000001ab1ab
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+mask3:
+dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_iso_norm_by8, 01, 00, 0020
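For reading purposes, the whole routine above computes the same value as a plain bit-at-a-time CRC-64 with polynomial 0x1B (the rk8 constant), MSB-first bit order, and an inverted seed and result (the not arg1 / not rax pair). The scalar model below is offered as an aid to reading the assembly, under those assumptions, rather than as verified library code (crc64_iso_norm_ref is an illustrative name):

    #include <stddef.h>
    #include <stdint.h>

    /* Bit-at-a-time model of crc64_iso_norm: P = x^64 + x^4 + x^3 + x + 1. */
    static uint64_t crc64_iso_norm_ref(uint64_t seed, const unsigned char *buf, size_t len)
    {
            uint64_t crc = ~seed;
            while (len--) {
                    crc ^= (uint64_t)*buf++ << 56;
                    for (int i = 0; i < 8; i++)
                            crc = (crc & 0x8000000000000000ULL) ? (crc << 1) ^ 0x1BULL
                                                                : crc << 1;
            }
            return ~crc;
    }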
diff --git a/src/isa-l/crc/crc64_iso_refl_by16_10.asm b/src/isa-l/crc/crc64_iso_refl_by16_10.asm
new file mode 100644
index 000000000..e5d5a08fe
--- /dev/null
+++ b/src/isa-l/crc/crc64_iso_refl_by16_10.asm
@@ -0,0 +1,495 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_iso_refl_by16_10(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%ifndef FUNCTION_NAME
+%define FUNCTION_NAME crc64_iso_refl_by16_10
+%endif
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*12+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+align 16
+mk_global FUNCTION_NAME, function
+FUNCTION_NAME:
+ endbranch
+ not arg1
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+%endif
+
+ cmp arg3, 256
+ jl _less_than_256
+
+ ; load the initial crc value
+ vmovq xmm10, arg1 ; initial crc
+
+ ; receive the initial 128B data, xor the initial crc value
+ vmovdqu8 zmm0, [arg2+16*0]
+ vmovdqu8 zmm4, [arg2+16*4]
+ vpxorq zmm0, zmm10
+ vbroadcasti32x4 zmm10, [rk3] ;zmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+
+ sub arg3, 256
+ cmp arg3, 256
+ jl _fold_128_B_loop
+
+ vmovdqu8 zmm7, [arg2+16*8]
+ vmovdqu8 zmm8, [arg2+16*12]
+ vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
+ sub arg3, 256
+
+_fold_256_B_loop:
+ add arg2, 256
+ vmovdqu8 zmm3, [arg2+16*0]
+ vpclmulqdq zmm1, zmm0, zmm16, 0x10
+ vpclmulqdq zmm2, zmm0, zmm16, 0x01
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm3
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpclmulqdq zmm5, zmm4, zmm16, 0x10
+ vpclmulqdq zmm6, zmm4, zmm16, 0x01
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ vmovdqu8 zmm11, [arg2+16*8]
+ vpclmulqdq zmm12, zmm7, zmm16, 0x10
+ vpclmulqdq zmm13, zmm7, zmm16, 0x01
+ vpxorq zmm7, zmm12, zmm13
+ vpxorq zmm7, zmm7, zmm11
+
+ vmovdqu8 zmm17, [arg2+16*12]
+ vpclmulqdq zmm14, zmm8, zmm16, 0x10
+ vpclmulqdq zmm15, zmm8, zmm16, 0x01
+ vpxorq zmm8, zmm14, zmm15
+ vpxorq zmm8, zmm8, zmm17
+
+ sub arg3, 256
+ jge _fold_256_B_loop
+
+ ;; Fold 256 into 128
+ add arg2, 256
+ vpclmulqdq zmm1, zmm0, zmm10, 0x01
+ vpclmulqdq zmm2, zmm0, zmm10, 0x10
+ vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
+
+ vpclmulqdq zmm5, zmm4, zmm10, 0x01
+ vpclmulqdq zmm6, zmm4, zmm10, 0x10
+ vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
+
+ vmovdqa32 zmm0, zmm7
+ vmovdqa32 zmm4, zmm8
+
+ add arg3, 128
+ jmp _fold_128_B_register
+
+ ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
+_fold_128_B_loop:
+ add arg2, 128 ; update the buffer pointer
+ vmovdqu8 zmm8, [arg2+16*0]
+ vpclmulqdq zmm1, zmm0, zmm10, 0x10
+ vpclmulqdq zmm2, zmm0, zmm10, 0x01
+ vpxorq zmm0, zmm1, zmm2
+ vpxorq zmm0, zmm0, zmm8
+
+ vmovdqu8 zmm9, [arg2+16*4]
+ vpclmulqdq zmm5, zmm4, zmm10, 0x10
+ vpclmulqdq zmm6, zmm4, zmm10, 0x01
+ vpxorq zmm4, zmm5, zmm6
+ vpxorq zmm4, zmm4, zmm9
+
+ sub arg3, 128
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
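Note the pclmulqdq immediates above: the reflected kernels pair each accumulator half with the opposite constant half (0x01 and 0x10) where the normal kernels use 0x00 and 0x11, and they skip the byte shuffle, because the data is processed in reflected bit order. A C intrinsics sketch of the reflected fold step, assuming PCLMULQDQ and SSE2 (fold_16_refl is an illustrative name):

    #include <wmmintrin.h>
    #include <emmintrin.h>

    /* One reflected fold step: acc.hi * rk.lo  ^  acc.lo * rk.hi  ^  next. */
    static inline __m128i fold_16_refl(__m128i acc, __m128i rk, __m128i next)
    {
            __m128i a = _mm_clmulepi64_si128(acc, rk, 0x01); /* acc.hi * rk.lo */
            __m128i b = _mm_clmulepi64_si128(acc, rk, 0x10); /* acc.lo * rk.hi */
            return _mm_xor_si128(_mm_xor_si128(a, b), next);
    }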
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4
+
+_fold_128_B_register:
+ ; fold the 8 128b parts into 1 xmm register with different constants
+ vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
+ vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
+ vpclmulqdq zmm1, zmm0, zmm16, 0x01
+ vpclmulqdq zmm2, zmm0, zmm16, 0x10
+ vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
+
+ vpclmulqdq zmm5, zmm4, zmm11, 0x01
+ vpclmulqdq zmm6, zmm4, zmm11, 0x10
+ vmovdqa xmm10, [rk1] ; Needed later in reduction loop
+ vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
+ vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
+
+ vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
+ vpxorq ymm8, ymm8, ymm1
+ vextracti64x2 xmm5, ymm8, 1
+ vpxorq xmm7, xmm5, xmm8
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x1
+ vpclmulqdq xmm8, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vmovdqu xmm0, [arg2]
+ vpxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+ ; here we are getting data that is less than 16 bytes.
+	; since we know that there was data before the pointer, we can back the
+	; input pointer up so that exactly 16 bytes are loaded.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ vmovdqa xmm2, xmm7
+ vmovdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ vmovdqu xmm0, [rax]
+
+
+ vpshufb xmm7, xmm0
+ vpxor xmm0, [mask3]
+ vpshufb xmm2, xmm0
+
+ vpblendvb xmm2, xmm2, xmm1, xmm0
+ ;;;;;;;;;;
+ vmovdqa xmm8, xmm7
+ vpclmulqdq xmm7, xmm10, 0x1
+
+ vpclmulqdq xmm8, xmm10, 0x10
+ vpxor xmm7, xmm8
+ vpxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ vmovdqa xmm10, [rk5]
+ vmovdqa xmm0, xmm7
+
+ ;64b fold
+ vpclmulqdq xmm7, xmm10, 0
+ vpsrldq xmm0, 8
+ vpxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ vmovdqa xmm1, xmm7
+ vmovdqa xmm10, [rk7]
+
+ vpclmulqdq xmm7, xmm10, 0
+ vmovdqa xmm2, xmm7
+ vpclmulqdq xmm7, xmm10, 0x10
+ vpslldq xmm2, 8
+ vpxor xmm7, xmm2
+ vpxor xmm7, xmm1
+ vpextrq rax, xmm7, 1
+
+_cleanup:
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ vmovq xmm0, arg1 ; get the initial crc value
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ vmovq xmm0, arg1 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ vmovdqu xmm7, [arg2] ; load the plaintext
+ vpxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ vpxor xmm1, xmm1
+ mov r11, rsp
+ vmovdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ vmovdqa xmm7, [rsp]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ vmovdqu xmm0, [rax + r9]
+ vpshufb xmm7,xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Left shift (8-length) bytes in XMM
+ vmovdqu xmm0, [rax + r9 + 8]
+ vpshufb xmm7,xmm0
+
+ jmp _barrett
+
+align 16
+_exact_16_left:
+ vmovdqu xmm7, [arg2]
+ vpxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+align 32
+
+%ifndef USE_CONSTS
+; precomputed constants
+rk_1: dq 0x45000000b0000000
+rk_2: dq 0x6b700000f5000000
+rk1: dq 0xf500000000000001
+rk2: dq 0x6b70000000000001
+rk3: dq 0xb001000000010000
+rk4: dq 0xf501b0000001b000
+rk5: dq 0xf500000000000001
+rk6: dq 0x0000000000000000
+rk7: dq 0xb000000000000001
+rk8: dq 0xb000000000000000
+rk9: dq 0xe014514514501501
+rk10: dq 0x771db6db6db71c71
+rk11: dq 0xa101101101110001
+rk12: dq 0x1ab1ab1ab1aab001
+rk13: dq 0xf445014445000001
+rk14: dq 0x6aab71daab700001
+rk15: dq 0xb100010100000001
+rk16: dq 0x01b001b1b0000001
+rk17: dq 0xe145150000000001
+rk18: dq 0x76db6c7000000001
+rk19: dq 0xa011000000000001
+rk20: dq 0x1b1ab00000000001
+
+rk_1b: dq 0xf500000000000001
+rk_2b: dq 0x6b70000000000001
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%else
+INCLUDE_CONSTS
+%endif
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3: dq 0x8080808080808080, 0x8080808080808080
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FUNCTION_NAME
+no_ %+ FUNCTION_NAME %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/isa-l/crc/crc64_iso_refl_by8.asm b/src/isa-l/crc/crc64_iso_refl_by8.asm
new file mode 100644
index 000000000..b6dfcf0e4
--- /dev/null
+++ b/src/isa-l/crc/crc64_iso_refl_by8.asm
@@ -0,0 +1,545 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_iso_refl_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+
+align 16
+mk_global crc64_iso_refl_by8, function
+crc64_iso_refl_by8:
+ endbranch
+	; uint64_t c = crc ^ 0xffffffffffffffffULL;
+ not arg1
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; push the xmm registers into the stack to maintain
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point in the code, there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+	; loop will fold 128B at a time until only 128+y bytes of buffer remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm1, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm3, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm5, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm7, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+ ; xmm0 to xmm7
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+ ;xmm1 to xmm7
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+ ; xmm6 to xmm7
+ movdqa xmm10, [rk1]
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+ ; here we are getting data that is less than 16 bytes.
+	; since we know that there was data before the pointer, we can back the input pointer up so that exactly 16 bytes are loaded.
+ ; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ movdqa xmm2, xmm7
+ movdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ movdqu xmm0, [rax]
+
+
+ pshufb xmm7, xmm0
+ pxor xmm0, [mask3]
+ pshufb xmm2, xmm0
+
+ pblendvb xmm2, xmm1 ;xmm0 is implicit
+ ;;;;;;;;;;
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5]
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0
+ psrldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm1, xmm7
+ movdqa xmm10, [rk7]
+
+ pclmulqdq xmm7, xmm10, 0
+ movdqa xmm2, xmm7
+ pclmulqdq xmm7, xmm10, 0x10
+ pslldq xmm2, 8
+ pxor xmm7, xmm2
+ pxor xmm7, xmm1
+ pextrq rax, xmm7, 1
+
+_cleanup:
+	; return c ^ 0xffffffffffffffffULL;
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movq xmm0, arg1 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax + r9]
+ pshufb xmm7,xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Left shift (8-length) bytes in XMM
+ movdqu xmm0, [rax + r9 + 8]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+; rk7 = floor(2^128/Q)
+; rk8 = Q
+rk1:
+DQ 0xf500000000000001
+rk2:
+DQ 0x6b70000000000001
+rk3:
+DQ 0xb001000000010000
+rk4:
+DQ 0xf501b0000001b000
+rk5:
+DQ 0xf500000000000001
+rk6:
+DQ 0x0000000000000000
+rk7:
+DQ 0xb000000000000001
+rk8:
+DQ 0xb000000000000000
+rk9:
+DQ 0xe014514514501501
+rk10:
+DQ 0x771db6db6db71c71
+rk11:
+DQ 0xa101101101110001
+rk12:
+DQ 0x1ab1ab1ab1aab001
+rk13:
+DQ 0xf445014445000001
+rk14:
+DQ 0x6aab71daab700001
+rk15:
+DQ 0xb100010100000001
+rk16:
+DQ 0x01b001b1b0000001
+rk17:
+DQ 0xe145150000000001
+rk18:
+DQ 0x76db6c7000000001
+rk19:
+DQ 0xa011000000000001
+rk20:
+DQ 0x1b1ab00000000001
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3:
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_iso_refl_by8, 01, 00, 0023
diff --git a/src/isa-l/crc/crc64_jones_norm_by16_10.asm b/src/isa-l/crc/crc64_jones_norm_by16_10.asm
new file mode 100644
index 000000000..2c9836b95
--- /dev/null
+++ b/src/isa-l/crc/crc64_jones_norm_by16_10.asm
@@ -0,0 +1,61 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define FUNCTION_NAME crc64_jones_norm_by16_10
+%define USE_CONSTS
+%macro INCLUDE_CONSTS 0
+rk_1: dq 0x44ff5212394b1c52
+rk_2: dq 0x956d6cb0582122b2
+rk1: dq 0x4445ed2750017038
+rk2: dq 0x698b74157cfbd736
+rk3: dq 0x0cfcfb5101c4b775
+rk4: dq 0x65403fd47cbec866
+rk5: dq 0x4445ed2750017038
+rk6: dq 0x0000000000000000
+rk7: dq 0xddf3eeb298be6cf8
+rk8: dq 0xad93d23594c935a9
+rk9: dq 0xd8dc208e2ba527b4
+rk10: dq 0xf032cfec76bb2bc5
+rk11: dq 0xb536044f357f4238
+rk12: dq 0xfdbf104d938ba67a
+rk13: dq 0xeeddad9297a843e7
+rk14: dq 0x3550bce629466473
+rk15: dq 0x4e501e58ca43d25e
+rk16: dq 0x13c961588f27f643
+rk17: dq 0x3b60d00dcb1099bc
+rk18: dq 0x44bf1f468c53b9a3
+rk19: dq 0x96f2236e317179ee
+rk20: dq 0xf00839aa0dd64bac
+rk_1b: dq 0x4445ed2750017038
+rk_2b: dq 0x698b74157cfbd736
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%endm
+
+%include "crc64_iso_norm_by16_10.asm"
diff --git a/src/isa-l/crc/crc64_jones_norm_by8.asm b/src/isa-l/crc/crc64_jones_norm_by8.asm
new file mode 100644
index 000000000..0cf8b4ad9
--- /dev/null
+++ b/src/isa-l/crc/crc64_jones_norm_by8.asm
@@ -0,0 +1,582 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Function API:
+; uint64_t crc64_jones_norm_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+align 16
+mk_global crc64_jones_norm_by8, function
+crc64_jones_norm_by8:
+ endbranch
+
+ not arg1 ;~init_crc
+
+ sub rsp,VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the xmm registers on the stack; the win64 ABI requires xmm6-xmm15 to be preserved
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+
+ ; check if smaller than 256
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+
+	; the crc value does not need to be byte-reflected, but it does need to be moved to the high part of the register,
+	; because the data will be byte-reflected and will then line up with the initial crc in the correct place.
+ pslldq xmm10, 8
+
+ movdqa xmm11, [SHUF_MASK]
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ pshufb xmm0, xmm11
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ pshufb xmm1, xmm11
+ pshufb xmm2, xmm11
+ pshufb xmm3, xmm11
+ pshufb xmm4, xmm11
+ pshufb xmm5, xmm11
+ pshufb xmm6, xmm11
+ pshufb xmm7, xmm11
+
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point in the code there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+	; will fold 128B at a time until only 128+y bytes of buffer remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128 ; buf += 128;
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm1, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm3, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm5, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ pshufb xmm9, xmm11
+ pshufb xmm12, xmm11
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x0
+ pclmulqdq xmm8, xmm10 , 0x11
+ pclmulqdq xmm7, xmm10, 0x0
+ pclmulqdq xmm13, xmm10 , 0x11
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+
+ movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
+
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pshufb xmm0, xmm11
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ ; check if any more data to fold. If not, compute the CRC of the final 128 bits
+ add arg3, 16
+ je _128_done
+
+	; here we are left with less than 16 bytes of data.
+	; since we know there was data before the current pointer, we can back the input pointer up so that exactly 16 bytes are loaded.
+	; after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa xmm2, xmm7
+
+ movdqu xmm1, [arg2 - 16 + arg3]
+ pshufb xmm1, xmm11
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, arg3
+ movdqu xmm0, [rax]
+
+ ; shift xmm2 to the left by arg3 bytes
+ pshufb xmm2, xmm0
+
+ ; shift xmm7 to the right by 16-arg3 bytes
+ pxor xmm0, [mask1]
+ pshufb xmm7, xmm0
+ pblendvb xmm1, xmm2 ;xmm0 is implicit
+
+ ; fold 16 Bytes
+ movdqa xmm2, xmm1
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x11
+ pclmulqdq xmm8, xmm10, 0x0
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0x01 ; H*L
+ pslldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
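+	; rk7 holds the precomputed quotient floor(x^128 / P) and rk8 holds the
+	; Jones polynomial P itself (0xad93d23594c935a9); together they reduce the
+	; remaining 128-bit value to the final 64-bit CRC.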
+_barrett:
+ movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
+ movdqa xmm0, xmm7
+
+ movdqa xmm1, xmm7
+ pand xmm1, [mask3]
+ pclmulqdq xmm7, xmm10, 0x01
+ pxor xmm7, xmm1
+
+ pclmulqdq xmm7, xmm10, 0x11
+ pxor xmm7, xmm0
+ pextrq rax, xmm7, 0
+
+_cleanup:
+ not rax
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+ movdqa xmm11, [SHUF_MASK]
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0
+
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movdqa xmm11, [SHUF_MASK]
+
+ movq xmm0, arg1 ; get the initial crc value
+ pslldq xmm0, 8 ; align it to its correct place
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pshufb xmm7, xmm11 ; byte-reflect the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+_zero_left:
+ movdqa xmm7, [rsp]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ ; shl r9, 4
+ lea rax, [pshufb_shf_table + 16]
+ sub rax, r9
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax]
+ pxor xmm0, [mask1]
+
+ pshufb xmm7, xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Right shift (8-length) bytes in XMM
+ add rax, 8
+ movdqu xmm0, [rax]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pshufb xmm7, xmm11
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+
+rk1:
+DQ 0x4445ed2750017038
+rk2:
+DQ 0x698b74157cfbd736
+rk3:
+DQ 0x0cfcfb5101c4b775
+rk4:
+DQ 0x65403fd47cbec866
+rk5:
+DQ 0x4445ed2750017038
+rk6:
+DQ 0x0000000000000000
+rk7:
+DQ 0xddf3eeb298be6cf8
+rk8:
+DQ 0xad93d23594c935a9
+rk9:
+DQ 0xd8dc208e2ba527b4
+rk10:
+DQ 0xf032cfec76bb2bc5
+rk11:
+DQ 0xb536044f357f4238
+rk12:
+DQ 0xfdbf104d938ba67a
+rk13:
+DQ 0xeeddad9297a843e7
+rk14:
+DQ 0x3550bce629466473
+rk15:
+DQ 0x4e501e58ca43d25e
+rk16:
+DQ 0x13c961588f27f643
+rk17:
+DQ 0x3b60d00dcb1099bc
+rk18:
+DQ 0x44bf1f468c53b9a3
+rk19:
+DQ 0x96f2236e317179ee
+rk20:
+DQ 0xf00839aa0dd64bac
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+mask2:
+dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
+mask3:
+dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
+dq 0x8080808080808080, 0x8080808080808080
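+; The code loads this table at offset (16 - remaining_len) to obtain a pshufb
+; mask that shifts the partial block into place; the 0x80 bytes zero the
+; unused lanes.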
+
+;;; func core, ver, snum
+slversion crc64_jones_norm_by8, 01, 00, 0026
diff --git a/src/isa-l/crc/crc64_jones_refl_by16_10.asm b/src/isa-l/crc/crc64_jones_refl_by16_10.asm
new file mode 100644
index 000000000..39502729b
--- /dev/null
+++ b/src/isa-l/crc/crc64_jones_refl_by16_10.asm
@@ -0,0 +1,61 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define FUNCTION_NAME crc64_jones_refl_by16_10
+%define USE_CONSTS
+%macro INCLUDE_CONSTS 0
+rk_1: dq 0x9471a5389095fe44
+rk_2: dq 0x9a8908341a6d6d52
+rk1: dq 0x381d0015c96f4444
+rk2: dq 0xd9d7be7d505da32c
+rk3: dq 0x768361524d29ed0b
+rk4: dq 0xcc26fa7c57f8054c
+rk5: dq 0x381d0015c96f4444
+rk6: dq 0x0000000000000000
+rk7: dq 0x3e6cfa329aef9f77
+rk8: dq 0x2b5926535897936a
+rk9: dq 0x5bc94ba8e2087636
+rk10: dq 0x6cf09c8f37710b75
+rk11: dq 0x3885fd59e440d95a
+rk12: dq 0xbccba3936411fb7e
+rk13: dq 0xe4dd0d81cbfce585
+rk14: dq 0xb715e37b96ed8633
+rk15: dq 0xf49784a634f014e4
+rk16: dq 0xaf86efb16d9ab4fb
+rk17: dq 0x7b3211a760160db8
+rk18: dq 0xa062b2319d66692f
+rk19: dq 0xef3d1d18ed889ed2
+rk20: dq 0x6ba4d760ab38201e
+rk_1b: dq 0x381d0015c96f4444
+rk_2b: dq 0xd9d7be7d505da32c
+ dq 0x0000000000000000
+ dq 0x0000000000000000
+%endm
+
+%include "crc64_iso_refl_by16_10.asm"
diff --git a/src/isa-l/crc/crc64_jones_refl_by8.asm b/src/isa-l/crc/crc64_jones_refl_by8.asm
new file mode 100644
index 000000000..eea9c8ddf
--- /dev/null
+++ b/src/isa-l/crc/crc64_jones_refl_by8.asm
@@ -0,0 +1,545 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Function API:
+; uint64_t crc64_jones_refl_by8(
+; uint64_t init_crc, //initial CRC value, 64 bits
+; const unsigned char *buf, //buffer pointer to calculate CRC on
+; uint64_t len //buffer length in bytes (64-bit data)
+; );
+;
+%include "reg_sizes.asm"
+
+%define fetch_dist 1024
+
+[bits 64]
+default rel
+
+section .text
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+%define TMP 16*0
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_SAVE 16*2
+ %define VARIABLE_OFFSET 16*10+8
+%else
+ %define VARIABLE_OFFSET 16*2+8
+%endif
+
+
+align 16
+mk_global crc64_jones_refl_by8, function
+crc64_jones_refl_by8:
+ endbranch
+	; uint64_t c = crc ^ 0xffffffffffffffffULL;
+ not arg1
+ sub rsp, VARIABLE_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+	; save the xmm registers on the stack; the win64 ABI requires xmm6-xmm15 to be preserved
+ movdqa [rsp + XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + XMM_SAVE + 16*7], xmm13
+%endif
+
+ ; check if smaller than 256B
+ cmp arg3, 256
+
+ ; for sizes less than 256, we can't fold 128B at a time...
+ jl _less_than_256
+
+
+ ; load the initial crc value
+ movq xmm10, arg1 ; initial crc
+ ; receive the initial 128B data, xor the initial crc value
+ movdqu xmm0, [arg2+16*0]
+ movdqu xmm1, [arg2+16*1]
+ movdqu xmm2, [arg2+16*2]
+ movdqu xmm3, [arg2+16*3]
+ movdqu xmm4, [arg2+16*4]
+ movdqu xmm5, [arg2+16*5]
+ movdqu xmm6, [arg2+16*6]
+ movdqu xmm7, [arg2+16*7]
+
+ ; XOR the initial_crc value
+ pxor xmm0, xmm10
+ movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
+ ;imm value of pclmulqdq instruction will determine which constant to use
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; we subtract 256 instead of 128 to save one instruction from the loop
+ sub arg3, 256
+
+	; at this point in the code there are 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
+	; will fold 128B at a time until only 128+y bytes of buffer remain
+
+
+ ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
+_fold_128_B_loop:
+
+ ; update the buffer pointer
+ add arg2, 128
+
+ prefetchnta [arg2+fetch_dist+0]
+ movdqu xmm9, [arg2+16*0]
+ movdqu xmm12, [arg2+16*1]
+ movdqa xmm8, xmm0
+ movdqa xmm13, xmm1
+ pclmulqdq xmm0, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm1, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm0, xmm9
+ xorps xmm0, xmm8
+ pxor xmm1, xmm12
+ xorps xmm1, xmm13
+
+ prefetchnta [arg2+fetch_dist+32]
+ movdqu xmm9, [arg2+16*2]
+ movdqu xmm12, [arg2+16*3]
+ movdqa xmm8, xmm2
+ movdqa xmm13, xmm3
+ pclmulqdq xmm2, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm3, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm2, xmm9
+ xorps xmm2, xmm8
+ pxor xmm3, xmm12
+ xorps xmm3, xmm13
+
+ prefetchnta [arg2+fetch_dist+64]
+ movdqu xmm9, [arg2+16*4]
+ movdqu xmm12, [arg2+16*5]
+ movdqa xmm8, xmm4
+ movdqa xmm13, xmm5
+ pclmulqdq xmm4, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm5, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm4, xmm9
+ xorps xmm4, xmm8
+ pxor xmm5, xmm12
+ xorps xmm5, xmm13
+
+ prefetchnta [arg2+fetch_dist+96]
+ movdqu xmm9, [arg2+16*6]
+ movdqu xmm12, [arg2+16*7]
+ movdqa xmm8, xmm6
+ movdqa xmm13, xmm7
+ pclmulqdq xmm6, xmm10, 0x10
+ pclmulqdq xmm8, xmm10 , 0x1
+ pclmulqdq xmm7, xmm10, 0x10
+ pclmulqdq xmm13, xmm10 , 0x1
+ pxor xmm6, xmm9
+ xorps xmm6, xmm8
+ pxor xmm7, xmm12
+ xorps xmm7, xmm13
+
+ sub arg3, 128
+
+ ; check if there is another 128B in the buffer to be able to fold
+ jge _fold_128_B_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add arg2, 128
+ ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
+ ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+
+ ; fold the 8 xmm registers to 1 xmm register with different constants
+ ; xmm0 to xmm7
+ movdqa xmm10, [rk9]
+ movdqa xmm8, xmm0
+ pclmulqdq xmm0, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm0
+ ;xmm1 to xmm7
+ movdqa xmm10, [rk11]
+ movdqa xmm8, xmm1
+ pclmulqdq xmm1, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm1
+
+ movdqa xmm10, [rk13]
+ movdqa xmm8, xmm2
+ pclmulqdq xmm2, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+ movdqa xmm10, [rk15]
+ movdqa xmm8, xmm3
+ pclmulqdq xmm3, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm3
+
+ movdqa xmm10, [rk17]
+ movdqa xmm8, xmm4
+ pclmulqdq xmm4, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm4
+
+ movdqa xmm10, [rk19]
+ movdqa xmm8, xmm5
+ pclmulqdq xmm5, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ xorps xmm7, xmm5
+ ; xmm6 to xmm7
+ movdqa xmm10, [rk1]
+ movdqa xmm8, xmm6
+ pclmulqdq xmm6, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm6
+
+
+ ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
+ ; instead of a cmp instruction, we use the negative flag with the jl instruction
+ add arg3, 128-16
+ jl _final_reduction_for_128
+
+ ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
+ ; we can fold 16 bytes at a time if y>=16
+ ; continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ movdqu xmm0, [arg2]
+ pxor xmm7, xmm0
+ add arg2, 16
+ sub arg3, 16
+ ; instead of a cmp instruction, we utilize the flags with the jge instruction
+ ; equivalent of: cmp arg3, 16-16
+ ; check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ ;now we have 16+z bytes left to reduce, where 0<= z < 16.
+ ;first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ add arg3, 16
+ je _128_done
+	; here we are left with less than 16 bytes of data.
+	; since we know there was data before the current pointer, we can back the input pointer up so that exactly 16 bytes are loaded.
+	; after that the registers need to be adjusted.
+_get_last_two_xmms:
+
+
+ movdqa xmm2, xmm7
+ movdqu xmm1, [arg2 - 16 + arg3]
+
+ ; get rid of the extra data that was loaded before
+ ; load the shift constant
+ lea rax, [pshufb_shf_table]
+ add rax, arg3
+ movdqu xmm0, [rax]
+
+
+ pshufb xmm7, xmm0
+ pxor xmm0, [mask3]
+ pshufb xmm2, xmm0
+
+ pblendvb xmm2, xmm1 ;xmm0 is implicit
+ ;;;;;;;;;;
+ movdqa xmm8, xmm7
+ pclmulqdq xmm7, xmm10, 0x1
+
+ pclmulqdq xmm8, xmm10, 0x10
+ pxor xmm7, xmm8
+ pxor xmm7, xmm2
+
+_128_done:
+ ; compute crc of a 128-bit value
+ movdqa xmm10, [rk5]
+ movdqa xmm0, xmm7
+
+ ;64b fold
+ pclmulqdq xmm7, xmm10, 0
+ psrldq xmm0, 8
+ pxor xmm7, xmm0
+
+ ;barrett reduction
+_barrett:
+ movdqa xmm1, xmm7
+ movdqa xmm10, [rk7]
+
+ pclmulqdq xmm7, xmm10, 0
+ movdqa xmm2, xmm7
+ pclmulqdq xmm7, xmm10, 0x10
+ pslldq xmm2, 8
+ pxor xmm7, xmm2
+ pxor xmm7, xmm1
+ pextrq rax, xmm7, 1
+
+_cleanup:
+	; return c ^ 0xffffffffffffffffULL;
+ not rax
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + XMM_SAVE + 16*7]
+%endif
+ add rsp, VARIABLE_OFFSET
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_less_than_256:
+
+ ; check if there is enough buffer to be able to fold 16B at a time
+ cmp arg3, 32
+ jl _less_than_32
+
+ ; if there is, load the constants
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+
+ movq xmm0, arg1 ; get the initial crc value
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0
+
+ ; update the buffer pointer
+ add arg2, 16
+
+ ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
+ sub arg3, 32
+
+ jmp _16B_reduction_loop
+
+align 16
+_less_than_32:
+ ; mov initial crc to the return value. this is necessary for zero-length buffers.
+ mov rax, arg1
+ test arg3, arg3
+ je _cleanup
+
+ movq xmm0, arg1 ; get the initial crc value
+
+ cmp arg3, 16
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu xmm7, [arg2] ; load the plaintext
+ pxor xmm7, xmm0 ; xor the initial crc value
+ add arg2, 16
+ sub arg3, 16
+ movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+align 16
+_less_than_16_left:
+ ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+
+ pxor xmm1, xmm1
+ mov r11, rsp
+ movdqa [r11], xmm1
+
+ ; backup the counter value
+ mov r9, arg3
+ cmp arg3, 8
+ jl _less_than_8_left
+
+ ; load 8 Bytes
+ mov rax, [arg2]
+ mov [r11], rax
+ add r11, 8
+ sub arg3, 8
+ add arg2, 8
+_less_than_8_left:
+
+ cmp arg3, 4
+ jl _less_than_4_left
+
+ ; load 4 Bytes
+ mov eax, [arg2]
+ mov [r11], eax
+ add r11, 4
+ sub arg3, 4
+ add arg2, 4
+_less_than_4_left:
+
+ cmp arg3, 2
+ jl _less_than_2_left
+
+ ; load 2 Bytes
+ mov ax, [arg2]
+ mov [r11], ax
+ add r11, 2
+ sub arg3, 2
+ add arg2, 2
+_less_than_2_left:
+ cmp arg3, 1
+ jl _zero_left
+
+ ; load 1 Byte
+ mov al, [arg2]
+ mov [r11], al
+
+_zero_left:
+ movdqa xmm7, [rsp]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ lea rax,[pshufb_shf_table]
+
+ cmp r9, 8
+ jl _end_1to7
+
+_end_8to15:
+ movdqu xmm0, [rax + r9]
+ pshufb xmm7,xmm0
+ jmp _128_done
+
+_end_1to7:
+ ; Left shift (8-length) bytes in XMM
+ movdqu xmm0, [rax + r9 + 8]
+ pshufb xmm7,xmm0
+
+ jmp _barrett
+
+align 16
+_exact_16_left:
+ movdqu xmm7, [arg2]
+ pxor xmm7, xmm0 ; xor the initial crc value
+
+ jmp _128_done
+
+section .data
+
+; precomputed constants
+align 16
+; rk7 = floor(2^128/Q)
+; rk8 = Q
+rk1:
+DQ 0x381d0015c96f4444
+rk2:
+DQ 0xd9d7be7d505da32c
+rk3:
+DQ 0x768361524d29ed0b
+rk4:
+DQ 0xcc26fa7c57f8054c
+rk5:
+DQ 0x381d0015c96f4444
+rk6:
+DQ 0x0000000000000000
+rk7:
+DQ 0x3e6cfa329aef9f77
+rk8:
+DQ 0x2b5926535897936a
+rk9:
+DQ 0x5bc94ba8e2087636
+rk10:
+DQ 0x6cf09c8f37710b75
+rk11:
+DQ 0x3885fd59e440d95a
+rk12:
+DQ 0xbccba3936411fb7e
+rk13:
+DQ 0xe4dd0d81cbfce585
+rk14:
+DQ 0xb715e37b96ed8633
+rk15:
+DQ 0xf49784a634f014e4
+rk16:
+DQ 0xaf86efb16d9ab4fb
+rk17:
+DQ 0x7b3211a760160db8
+rk18:
+DQ 0xa062b2319d66692f
+rk19:
+DQ 0xef3d1d18ed889ed2
+rk20:
+DQ 0x6ba4d760ab38201e
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+
+mask:
+dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+mask2:
+dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+mask3:
+dq 0x8080808080808080, 0x8080808080808080
+
+;;; func core, ver, snum
+slversion crc64_jones_refl_by8, 01, 00, 0029
diff --git a/src/isa-l/crc/crc64_multibinary.asm b/src/isa-l/crc/crc64_multibinary.asm
new file mode 100644
index 000000000..3e06a0ecb
--- /dev/null
+++ b/src/isa-l/crc/crc64_multibinary.asm
@@ -0,0 +1,92 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;; uint64_t crc64_func(uint64_t init_crc, const unsigned char *buf, uint64_t len);
+;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern crc64_ecma_refl_by8
+extern crc64_ecma_refl_base
+
+extern crc64_ecma_norm_by8
+extern crc64_ecma_norm_base
+
+extern crc64_iso_refl_by8
+extern crc64_iso_refl_base
+
+extern crc64_iso_norm_by8
+extern crc64_iso_norm_base
+
+extern crc64_jones_refl_by8
+extern crc64_jones_refl_base
+
+extern crc64_jones_norm_by8
+extern crc64_jones_norm_base
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern crc64_iso_refl_by16_10
+extern crc64_iso_norm_by16_10
+extern crc64_jones_refl_by16_10
+extern crc64_jones_norm_by16_10
+extern crc64_ecma_refl_by16_10
+extern crc64_ecma_norm_by16_10
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+mbin_interface crc64_ecma_refl
+mbin_dispatch_init7 crc64_ecma_refl, crc64_ecma_refl_base, crc64_ecma_refl_by8, crc64_ecma_refl_by8, crc64_ecma_refl_by8, crc64_ecma_refl_by8, crc64_ecma_refl_by16_10
+mbin_interface crc64_ecma_norm
+mbin_dispatch_init7 crc64_ecma_norm, crc64_ecma_norm_base, crc64_ecma_norm_by8, crc64_ecma_norm_by8, crc64_ecma_norm_by8, crc64_ecma_norm_by8, crc64_ecma_norm_by16_10
+
+mbin_interface crc64_iso_refl
+mbin_dispatch_init7 crc64_iso_refl, crc64_iso_refl_base, crc64_iso_refl_by8, crc64_iso_refl_by8, crc64_iso_refl_by8, crc64_iso_refl_by8, crc64_iso_refl_by16_10
+mbin_interface crc64_iso_norm
+mbin_dispatch_init7 crc64_iso_norm, crc64_iso_norm_base, crc64_iso_norm_by8, crc64_iso_norm_by8, crc64_iso_norm_by8, crc64_iso_norm_by8, crc64_iso_norm_by16_10
+
+mbin_interface crc64_jones_refl
+mbin_dispatch_init7 crc64_jones_refl, crc64_jones_refl_base, crc64_jones_refl_by8, crc64_jones_refl_by8, crc64_jones_refl_by8, crc64_jones_refl_by8, crc64_jones_refl_by16_10
+mbin_interface crc64_jones_norm
+mbin_dispatch_init7 crc64_jones_norm, crc64_jones_norm_base, crc64_jones_norm_by8, crc64_jones_norm_by8, crc64_jones_norm_by8, crc64_jones_norm_by8, crc64_jones_norm_by16_10
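+
+; Each mbin_interface/mbin_dispatch_init7 pair publishes one public symbol
+; (e.g. crc64_jones_norm) and binds it at run time to the best routine the CPU
+; supports, falling back to the portable *_base implementation; the by16_10
+; variants are only candidates when assembled with AS_FEATURE_LEVEL >= 10.
+; (A summary of the multibinary.asm macros, not their exact argument order.)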
+
+;;; func core, ver, snum
+slversion crc64_ecma_refl, 00, 00, 001b
+slversion crc64_ecma_norm, 00, 00, 0018
+slversion crc64_iso_refl, 00, 00, 0021
+slversion crc64_iso_norm, 00, 00, 001e
+slversion crc64_jones_refl, 00, 00, 0027
+slversion crc64_jones_norm, 00, 00, 0024
diff --git a/src/isa-l/crc/crc64_ref.h b/src/isa-l/crc/crc64_ref.h
new file mode 100644
index 000000000..b30f63866
--- /dev/null
+++ b/src/isa-l/crc/crc64_ref.h
@@ -0,0 +1,148 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _CRC64_REF_H
+#define _CRC64_REF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "crc64.h"
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#define MAX_ITER 8
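+// MAX_ITER is the number of bit-at-a-time steps taken per input byte (8) by
+// the reference implementations below.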
+
+// crc64_ecma reference function, slow crc64 from the definition.
+static inline uint64_t crc64_ecma_refl_ref(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ unsigned int i, j;
+
+ uint64_t poly = 0xC96C5795D7870F42ULL; // ECMA-182 standard reflected
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ (uint64_t) buf[i];
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x1ULL ? poly : 0) ^ (rem >> 1);
+ }
+ }
+ return ~rem;
+}
+
+static inline uint64_t crc64_ecma_norm_ref(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ unsigned int i, j;
+
+ uint64_t poly = 0x42F0E1EBA9EA3693ULL; // ECMA-182 standard
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ ((uint64_t) buf[i] << 56);
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x8000000000000000ULL ? poly : 0) ^ (rem << 1);
+ }
+ }
+ return ~rem;
+}
+
+// crc64_iso reference function, slow crc64 from the definition.
+static inline uint64_t crc64_iso_refl_ref(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ unsigned int i, j;
+
+ uint64_t poly = 0xD800000000000000ULL; // ISO standard reflected
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ (uint64_t) buf[i];
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x1ULL ? poly : 0) ^ (rem >> 1);
+ }
+ }
+ return ~rem;
+}
+
+static inline uint64_t crc64_iso_norm_ref(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ unsigned int i, j;
+
+ uint64_t poly = 0x000000000000001BULL; // ISO standard
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ ((uint64_t) buf[i] << 56);
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x8000000000000000ULL ? poly : 0) ^ (rem << 1);
+ }
+ }
+ return ~rem;
+}
+
+// crc64_jones reference function, slow crc64 from the definition.
+static inline uint64_t crc64_jones_refl_ref(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ unsigned int i, j;
+
+ uint64_t poly = 0x95ac9329ac4bc9b5ULL; // Jones coefficients reflected
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ (uint64_t) buf[i];
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x1ULL ? poly : 0) ^ (rem >> 1);
+ }
+ }
+ return ~rem;
+}
+
+static inline uint64_t crc64_jones_norm_ref(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ unsigned int i, j;
+
+ uint64_t poly = 0xad93d23594c935a9ULL; // Jones coefficients
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ ((uint64_t) buf[i] << 56);
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x8000000000000000ULL ? poly : 0) ^ (rem << 1);
+ }
+ }
+ return ~rem;
+}
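+
+/*
+ * Usage sketch (illustrative, not part of the API): these bit-serial routines
+ * serve as slow references that the optimized implementations can be checked
+ * against, e.g.
+ *
+ *     uint64_t ref = crc64_jones_norm_ref(seed, buf, len);
+ *     uint64_t opt = crc64_jones_norm(seed, buf, len);
+ *     // ref and opt are expected to match for any buffer
+ */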
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/isa-l/crc/crc_base.c b/src/isa-l/crc/crc_base.c
new file mode 100644
index 000000000..d1eb2d22e
--- /dev/null
+++ b/src/isa-l/crc/crc_base.c
@@ -0,0 +1,351 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "crc.h"
+
+static const uint16_t crc16tab[256] = {
+ 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+ 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+ 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+ 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+ 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+ 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+ 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+ 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+ 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+ 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+ 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+ 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+ 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+ 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+ 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+ 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+ 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+ 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+ 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+ 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+ 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+ 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+ 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+ 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+ 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+ 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+ 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+ 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+ 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+ 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+ 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+ 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+static const uint32_t crc32_table_iscsi_refl[256] = {
+ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+ 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+ 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+ 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+ 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+ 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+ 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+ 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+ 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+ 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+ 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+ 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+ 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+ 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+ 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+ 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+ 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+ 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+ 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+ 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+ 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+ 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+ 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+ 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+ 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+ 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+ 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+ 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+ 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+ 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+ 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+ 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+ 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351
+};
+
+static const uint32_t crc32_table_ieee_norm[256] = {
+ 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
+ 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
+ 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
+ 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
+ 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9,
+ 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
+ 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011,
+ 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
+ 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
+ 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
+ 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81,
+ 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
+ 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49,
+ 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
+ 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
+ 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
+ 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae,
+ 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
+ 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
+ 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
+ 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
+ 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
+ 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066,
+ 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
+ 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e,
+ 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
+ 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
+ 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
+ 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
+ 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
+ 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686,
+ 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
+ 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
+ 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
+ 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f,
+ 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
+ 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
+ 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
+ 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
+ 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
+ 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7,
+ 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
+ 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f,
+ 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
+ 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
+ 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
+ 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f,
+ 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
+ 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
+ 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
+ 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
+ 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
+ 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30,
+ 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
+ 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088,
+ 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
+ 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
+ 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
+ 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
+ 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
+ 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0,
+ 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
+ 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
+ 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
+};
+
+static const uint32_t crc32_table_gzip_refl[256] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+ 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+ 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+ 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+ 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+ 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+ 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+ 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+ 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+ 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+ 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+ 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+ 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+ 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+ 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+ 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+ 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+ 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+uint16_t crc16_t10dif_base(uint16_t seed, uint8_t * buf, uint64_t len)
+{
+ int i;
+ uint16_t crc = seed;
+
+ for (i = 0; i < len; i++)
+ crc = (crc << 8) ^ crc16tab[((crc >> 8) ^ *buf++) & 0x00FF];
+
+ return crc;
+}
+
+uint16_t crc16_t10dif_copy_base(uint16_t seed, uint8_t * dst, uint8_t * src, uint64_t len)
+{
+ int i;
+ uint16_t crc = seed;
+
+ for (i = 0; i < len; i++) {
+ crc = (crc << 8) ^ crc16tab[((crc >> 8) ^ *src) & 0x00FF];
+ *dst++ = *src++;
+ }
+
+ return crc;
+}
+
+unsigned int crc32_iscsi_base(unsigned char *buffer, int len, unsigned int crc_init)
+{
+ unsigned int crc;
+ unsigned char *p_buf;
+ unsigned char *p_end = buffer + len;
+
+ p_buf = buffer;
+ crc = crc_init;
+
+ while (p_buf < p_end) {
+ crc = (crc >> 8) ^ crc32_table_iscsi_refl[(crc & 0x000000FF) ^ *p_buf++];
+ }
+ return crc;
+}
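+
+/* Note that crc32_iscsi_base leaves any bit inversion of the seed and result
+ * to the caller, while crc32_ieee_base and crc32_gzip_refl_base below invert
+ * internally (~seed on entry, ~crc on return). */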
+
+uint32_t crc32_ieee_base(uint32_t seed, uint8_t * buf, uint64_t len)
+{
+ unsigned int crc = ~seed;
+
+ while (len--) {
+ crc = (crc << 8) ^ crc32_table_ieee_norm[((crc >> 24) ^ *buf) & 255];
+ buf++;
+ }
+
+ return ~crc;
+}
+
+uint32_t crc32_gzip_refl_base(uint32_t seed, uint8_t * buf, uint64_t len)
+{
+ unsigned int crc;
+ unsigned char *p_buf;
+ unsigned char *p_end = buf + len;
+
+ p_buf = (unsigned char *)buf;
+ crc = ~seed;
+
+ while (p_buf < p_end) {
+ crc = (crc >> 8) ^ crc32_table_gzip_refl[(crc & 0x000000FF) ^ *p_buf++];
+ }
+
+ return ~crc;
+}
+
+struct slver {
+ unsigned short snum;
+ unsigned char ver;
+ unsigned char core;
+};
+
+struct slver crc32_iscsi_base_slver_0001011d;
+struct slver crc32_iscsi_base_slver = { 0x011d, 0x02, 0x00 };
+
+struct slver crc16_t10dif_base_slver_0001011e;
+struct slver crc16_t10dif_base_slver = { 0x011e, 0x02, 0x00 };
+
+struct slver crc32_ieee_base_slver_0001011f;
+struct slver crc32_ieee_base_slver = { 0x011f, 0x02, 0x00 };
+
+struct slver crc32_gzip_refl_base_slver_0000002b;
+struct slver crc32_gzip_refl_base_slver = { 0x002b, 0x00, 0x00 };
diff --git a/src/isa-l/crc/crc_base_aliases.c b/src/isa-l/crc/crc_base_aliases.c
new file mode 100644
index 000000000..0ffc62f96
--- /dev/null
+++ b/src/isa-l/crc/crc_base_aliases.c
@@ -0,0 +1,87 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "crc.h"
+#include "crc64.h"
+#include <stdint.h>
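+
+/* Base-only aliases: each public entry point simply forwards to the portable
+ * *_base implementation. This file is assumed to be used in builds that do not
+ * include the optimized multibinary dispatchers. */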
+
+unsigned int crc32_iscsi(unsigned char *buffer, int len, unsigned int crc_init)
+{
+ return crc32_iscsi_base(buffer, len, crc_init);
+}
+
+uint16_t crc16_t10dif(uint16_t seed, const unsigned char *buf, uint64_t len)
+{
+ return crc16_t10dif_base(seed, (uint8_t *) buf, len);
+}
+
+uint16_t crc16_t10dif_copy(uint16_t seed, uint8_t * dst, uint8_t * src, uint64_t len)
+{
+ return crc16_t10dif_copy_base(seed, dst, src, len);
+}
+
+uint32_t crc32_ieee(uint32_t seed, const unsigned char *buf, uint64_t len)
+{
+ return crc32_ieee_base(seed, (uint8_t *) buf, len);
+}
+
+uint32_t crc32_gzip_refl(uint32_t seed, const unsigned char *buf, uint64_t len)
+{
+ return crc32_gzip_refl_base(seed, (uint8_t *) buf, len);
+}
+
+uint64_t crc64_ecma_refl(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_ecma_refl_base(seed, buf, len);
+}
+
+uint64_t crc64_ecma_norm(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_ecma_norm_base(seed, buf, len);
+}
+
+uint64_t crc64_iso_refl(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_iso_refl_base(seed, buf, len);
+}
+
+uint64_t crc64_iso_norm(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_iso_norm_base(seed, buf, len);
+}
+
+uint64_t crc64_jones_refl(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_jones_refl_base(seed, buf, len);
+}
+
+uint64_t crc64_jones_norm(uint64_t seed, const uint8_t * buf, uint64_t len)
+{
+ return crc64_jones_norm_base(seed, buf, len);
+}
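Editorial note: this alias layer is the portable fallback; on builds without the multibinary dispatchers, every public entry point forwards straight to its _base implementation. Because crc32_ieee_base and crc32_gzip_refl_base complement the seed on entry and the result on exit, the returned value can be fed back in as the seed to process data in pieces. A minimal sketch (not part of ISA-L), assuming the prototypes from crc.h used above:

#include <stdint.h>
#include "crc.h"

/* Sketch: CRC of two buffers computed incrementally. Feeding the previous
 * result back as the seed should give the same value as one pass over the
 * concatenated data. */
static uint32_t crc_of_two_parts(const unsigned char *a, uint64_t alen,
                                 const unsigned char *b, uint64_t blen)
{
        uint32_t crc = 0;                        /* initial seed */

        crc = crc32_gzip_refl(crc, a, alen);     /* first chunk */
        crc = crc32_gzip_refl(crc, b, blen);     /* second chunk continues it */
        return crc;
}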
diff --git a/src/isa-l/crc/crc_multibinary.asm b/src/isa-l/crc/crc_multibinary.asm
new file mode 100644
index 000000000..a28a468fc
--- /dev/null
+++ b/src/isa-l/crc/crc_multibinary.asm
@@ -0,0 +1,328 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern crc32_iscsi_00
+extern crc32_iscsi_01
+extern crc32_iscsi_base
+
+extern crc32_ieee_01
+extern crc32_ieee_by4 ;; Optimized for SLM
+extern crc32_ieee_02
+extern crc32_ieee_base
+
+extern crc16_t10dif_01
+extern crc16_t10dif_by4 ;; Optimized for SLM
+extern crc16_t10dif_02
+extern crc16_t10dif_base
+
+extern crc32_gzip_refl_by8
+extern crc32_gzip_refl_by8_02
+extern crc32_gzip_refl_base
+
+extern crc16_t10dif_copy_by4
+extern crc16_t10dif_copy_by4_02
+extern crc16_t10dif_copy_base
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern crc32_gzip_refl_by16_10
+extern crc32_ieee_by16_10
+extern crc32_iscsi_by16_10
+extern crc16_t10dif_by16_10
+%endif
+
+%include "multibinary.asm"
+
+section .data
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+crc32_iscsi_dispatched:
+ dq crc32_iscsi_mbinit
+
+crc32_ieee_dispatched:
+ dq crc32_ieee_mbinit
+
+crc16_t10dif_dispatched:
+ dq crc16_t10dif_mbinit
+
+section .text
+;;;;
+; crc32_iscsi multibinary function
+;;;;
+mk_global crc32_iscsi, function
+crc32_iscsi_mbinit:
+ endbranch
+ call crc32_iscsi_dispatch_init
+crc32_iscsi:
+ endbranch
+ jmp qword [crc32_iscsi_dispatched]
+
+crc32_iscsi_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ lea rsi, [crc32_iscsi_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ jz .crc_iscsi_init_done ; use iscsi_base
+ lea rsi, [crc32_iscsi_00 WRT_OPT]
+ test ecx, FLAG_CPUID1_ECX_CLMUL
+ jz .crc_iscsi_init_done ; use iscsi_00
+ lea rsi, [crc32_iscsi_01 WRT_OPT]
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je .crc_iscsi_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne .crc_iscsi_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je .crc_iscsi_init_done
+ ;; No separate AVX/02 variant for iscsi; AVX gates the AVX-512 checks below
+
+%if AS_FEATURE_LEVEL >= 10
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je .crc_iscsi_init_done ; No AVX2 possible
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne .crc_iscsi_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ jne .crc_iscsi_init_done
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ lea rbx, [crc32_iscsi_by16_10 WRT_OPT] ; AVX512/10 opt
+ cmove rsi, rbx
+%endif
+
+.crc_iscsi_init_done:
+ mov [crc32_iscsi_dispatched], rsi
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
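Editorial note: the block above is the dispatch-on-first-call pattern described in the .data comment. crc32_iscsi_dispatched starts out pointing at the crc32_iscsi_mbinit stub; the stub runs the CPUID/XGETBV probe once, overwrites the pointer with the best implementation, and every later call is a single indirect jump. A rough C equivalent, purely illustrative (the identifiers below mirror the assembly labels but are not part of ISA-L):

#include "crc.h"

typedef unsigned int (*crc32_iscsi_fn)(unsigned char *buf, int len,
                                       unsigned int crc_init);

static unsigned int crc32_iscsi_first_call(unsigned char *buf, int len,
                                           unsigned int crc_init);

/* Starts at the init stub; overwritten on the first call. */
static crc32_iscsi_fn crc32_iscsi_dispatched = crc32_iscsi_first_call;

static unsigned int crc32_iscsi_first_call(unsigned char *buf, int len,
                                           unsigned int crc_init)
{
        crc32_iscsi_fn best = crc32_iscsi_base; /* safe default */

        /* ...CPUID/XGETBV probing would pick an optimized variant here... */
        crc32_iscsi_dispatched = best;
        return crc32_iscsi_dispatched(buf, len, crc_init);
}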
+
+;;;;
+; crc32_ieee multibinary function
+;;;;
+mk_global crc32_ieee, function
+crc32_ieee_mbinit:
+ endbranch
+ call crc32_ieee_dispatch_init
+crc32_ieee:
+ endbranch
+ jmp qword [crc32_ieee_dispatched]
+
+crc32_ieee_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ lea rsi, [crc32_ieee_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE3
+ jz .crc_ieee_init_done ; use ieee_base
+ test ecx, FLAG_CPUID1_ECX_CLMUL
+ jz .crc_ieee_init_done ; use ieee_base
+ lea rsi, [crc32_ieee_01 WRT_OPT]
+
+ ;; Extra Avoton test
+ lea rdx, [crc32_ieee_by4 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ cmove rsi, rdx
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je .crc_ieee_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne .crc_ieee_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je .crc_ieee_init_done
+ lea rsi, [crc32_ieee_02 WRT_OPT] ; AVX/02 opt
+
+%if AS_FEATURE_LEVEL >= 10
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je .crc_ieee_init_done ; No AVX2 possible
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne .crc_ieee_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ jne .crc_ieee_init_done
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ lea rbx, [crc32_ieee_by16_10 WRT_OPT] ; AVX512/10 opt
+ cmove rsi, rbx
+%endif
+
+.crc_ieee_init_done:
+ mov [crc32_ieee_dispatched], rsi
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+;;;;
+; crc16_t10dif multibinary function
+;;;;
+mk_global crc16_t10dif, function
+crc16_t10dif_mbinit:
+ endbranch
+ call crc16_t10dif_dispatch_init
+crc16_t10dif:
+ endbranch
+ jmp qword [crc16_t10dif_dispatched]
+
+crc16_t10dif_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ lea rsi, [crc16_t10dif_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE3
+ jz .t10dif_init_done ; use t10dif_base
+ test ecx, FLAG_CPUID1_ECX_CLMUL
+ jz .t10dif_init_done ; use t10dif_base
+ lea rsi, [crc16_t10dif_01 WRT_OPT]
+
+ ;; Extra Avoton test
+ lea rdx, [crc16_t10dif_by4 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ cmove rsi, rdx
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je .t10dif_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne .t10dif_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je .t10dif_init_done
+ lea rsi, [crc16_t10dif_02 WRT_OPT] ; AVX/02 opt
+
+%if AS_FEATURE_LEVEL >= 10
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je .t10dif_init_done ; No AVX2 possible
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne .t10dif_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ jne .t10dif_init_done
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ lea rbx, [crc16_t10dif_by16_10 WRT_OPT] ; AVX512/10 opt
+ cmove rsi, rbx
+%endif
+
+.t10dif_init_done:
+ mov [crc16_t10dif_dispatched], rsi
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+mbin_interface crc32_gzip_refl
+mbin_dispatch_init_clmul crc32_gzip_refl, crc32_gzip_refl_base, crc32_gzip_refl_by8, crc32_gzip_refl_by8_02, crc32_gzip_refl_by16_10
+
+mbin_interface crc16_t10dif_copy
+mbin_dispatch_init_clmul crc16_t10dif_copy, crc16_t10dif_copy_base, crc16_t10dif_copy_by4, crc16_t10dif_copy_by4_02, crc16_t10dif_copy_by4_02
+
+;;; func core, ver, snum
+slversion crc16_t10dif, 00, 03, 011a
+slversion crc32_ieee, 00, 03, 011b
+slversion crc32_iscsi, 00, 03, 011c
+slversion crc32_gzip_refl, 00, 00, 002a
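Editorial note: the three hand-rolled dispatchers above (and, in essence, the mbin_dispatch_init_clmul macro used for crc32_gzip_refl and crc16_t10dif_copy) walk a similar feature ladder: SSE4.2 or SSE3, then PCLMULQDQ, then OSXSAVE plus XGETBV to confirm the OS saves XMM/YMM state, then AVX, then (when AS_FEATURE_LEVEL >= 10) AVX2 and the AVX-512 feature groups. A hedged illustration of the ordering for the iscsi case, written with the GCC/Clang builtin rather than raw CPUID, and not part of ISA-L:

/* Sketch (GCC/Clang-specific) of the selection order implemented by the
 * crc32_iscsi dispatcher above. The assembly also verifies OS state saving
 * through XGETBV, and its FLAGS_CPUID7_*_AVX512_G1/G2 groups cover more
 * subfeatures than checked here. */
static const char *pick_crc32_iscsi_variant(void)
{
        if (!__builtin_cpu_supports("sse4.2"))
                return "crc32_iscsi_base";
        if (!__builtin_cpu_supports("pclmul"))
                return "crc32_iscsi_00";
        if (!__builtin_cpu_supports("avx"))
                return "crc32_iscsi_01";
        if (__builtin_cpu_supports("avx2") &&
            __builtin_cpu_supports("avx512f") &&
            __builtin_cpu_supports("avx512vl"))
                return "crc32_iscsi_by16_10";   /* AVX-512 path */
        return "crc32_iscsi_01";                /* AVX present, no AVX-512 */
}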
diff --git a/src/isa-l/crc/crc_ref.h b/src/isa-l/crc/crc_ref.h
new file mode 100644
index 000000000..e97a60b5e
--- /dev/null
+++ b/src/isa-l/crc/crc_ref.h
@@ -0,0 +1,140 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _CRC_REF_H
+#define _CRC_REF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "crc.h"
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#define MAX_ITER 8
+
+// iSCSI CRC reference function
+static inline unsigned int crc32_iscsi_ref(unsigned char *buffer, int len, unsigned int crc_init)
+{
+ uint64_t rem = crc_init;
+ int i, j;
+
+ uint32_t poly = 0x82F63B78;
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ (buffer[i]);
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x1ULL) ? (rem >> 1) ^ poly : (rem >> 1);
+ }
+ }
+ return rem;
+}
+
+// crc16_t10dif reference function, slow crc16 from the definition.
+static inline uint16_t crc16_t10dif_ref(uint16_t seed, uint8_t * buf, uint64_t len)
+{
+ size_t rem = seed;
+ unsigned int i, j;
+
+ uint16_t poly = 0x8bb7; // t10dif standard
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ (buf[i] << 8);
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = rem << 1;
+ rem = (rem & 0x10000) ? rem ^ poly : rem;
+ }
+ }
+ return rem;
+}
+
+// crc16_t10dif_copy reference function, slow crc16 from the definition.
+static inline uint16_t crc16_t10dif_copy_ref(uint16_t seed, uint8_t * dst, uint8_t * src, uint64_t len)
+{
+ size_t rem = seed;
+ unsigned int i, j;
+
+ uint16_t poly = 0x8bb7; // t10dif standard
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ (src[i] << 8);
+ dst[i] = src[i];
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = rem << 1;
+ rem = (rem & 0x10000) ? rem ^ poly : rem;
+ }
+ }
+ return rem;
+}
+
+// crc32_ieee reference function, slow crc32 from the definition.
+static inline uint32_t crc32_ieee_ref(uint32_t seed, uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ unsigned int i, j;
+
+ uint32_t poly = 0x04C11DB7; // IEEE standard
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ ((uint64_t) buf[i] << 24);
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = rem << 1;
+ rem = (rem & 0x100000000ULL) ? rem ^ poly : rem;
+ }
+ }
+ return ~rem;
+}
+
+// crc32_gzip_refl reference function, slow crc32 from the definition.
+// See crc.h for details on the differences between crc32_gzip_refl and
+// crc32_ieee.
+static inline uint32_t crc32_gzip_refl_ref(uint32_t seed, uint8_t * buf, uint64_t len)
+{
+ uint64_t rem = ~seed;
+ int i, j;
+
+ uint32_t poly = 0xEDB88320; // IEEE standard polynomial, reflected
+
+ for (i = 0; i < len; i++) {
+ rem = rem ^ (buf[i]);
+ for (j = 0; j < MAX_ITER; j++) {
+ rem = (rem & 0x1ULL) ? (rem >> 1) ^ poly : (rem >> 1);
+ }
+ }
+ return ~rem;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
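Editorial note: these bit-by-bit functions are deliberately slow references; their purpose is to cross-check the table-driven and vectorized paths, which is how the project's *_test.c programs use them. A minimal sketch along those lines, assuming crc32_ieee_base from crc_base.c is linked in:

#include <stdio.h>
#include <stdint.h>
#include "crc.h"
#include "crc_ref.h"

int main(void)
{
        uint8_t buf[256];
        uint32_t ref, base;
        int i;

        for (i = 0; i < 256; i++)
                buf[i] = (uint8_t) i;

        /* Both should produce the same CRC-32 (IEEE, normal form). */
        ref = crc32_ieee_ref(0x12345678, buf, sizeof(buf));
        base = crc32_ieee_base(0x12345678, buf, sizeof(buf));

        if (ref == base)
                printf("crc32_ieee: reference and base agree (0x%08x)\n", ref);
        else
                printf("crc32_ieee: MISMATCH ref=0x%08x base=0x%08x\n", ref, base);

        return ref != base;
}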
diff --git a/src/isa-l/crc/crc_simple_test.c b/src/isa-l/crc/crc_simple_test.c
new file mode 100644
index 000000000..4799f8745
--- /dev/null
+++ b/src/isa-l/crc/crc_simple_test.c
@@ -0,0 +1,64 @@
+/**********************************************************************
+ Copyright(c) 2011-2013 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdint.h>
+#include "crc.h"
+
+const uint16_t init_crc_16 = 0x1234;
+const uint16_t t10_dif_expected = 0x60b3;
+const uint32_t init_crc_32 = 0x12345678;
+const uint32_t ieee_expected = 0x2ceadbe3;
+
+int main(void)
+{
+ unsigned char p_buf[48];
+ uint16_t t10_dif_computed;
+ uint32_t ieee_computed;
+ int i;
+
+ for (i = 0; i < 48; i++)
+ p_buf[i] = i;
+
+ t10_dif_computed = crc16_t10dif(init_crc_16, p_buf, 48);
+
+ if (t10_dif_computed != t10_dif_expected)
+ printf("WRONG CRC-16(T10 DIF) value\n");
+ else
+ printf("CORRECT CRC-16(T10 DIF) value\n");
+
+ ieee_computed = crc32_ieee(init_crc_32, p_buf, 48);
+
+ if (ieee_computed != ieee_expected)
+ printf("WRONG CRC-32(IEEE) value\n");
+ else
+ printf("CORRECT CRC-32(IEEE) value\n");
+
+ return 0;
+}