path: root/src/isa-l/include
author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/isa-l/include
parent     Initial commit. (diff)
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/isa-l/include')
-rw-r--r--   src/isa-l/include/aarch64_multibinary.h   311
-rw-r--r--   src/isa-l/include/crc.h                    212
-rw-r--r--   src/isa-l/include/crc64.h                  277
-rw-r--r--   src/isa-l/include/erasure_code.h           947
-rw-r--r--   src/isa-l/include/gf_vect_mul.h            152
-rw-r--r--   src/isa-l/include/igzip_lib.h              990
-rw-r--r--   src/isa-l/include/mem_routines.h            64
-rw-r--r--   src/isa-l/include/multibinary.asm          440
-rw-r--r--   src/isa-l/include/raid.h                   305
-rw-r--r--   src/isa-l/include/reg_sizes.asm            291
-rw-r--r--   src/isa-l/include/test.h                   285
-rw-r--r--   src/isa-l/include/types.h                   77
-rw-r--r--   src/isa-l/include/unaligned.h               76
13 files changed, 4427 insertions, 0 deletions
diff --git a/src/isa-l/include/aarch64_multibinary.h b/src/isa-l/include/aarch64_multibinary.h
new file mode 100644
index 000000000..e31451be6
--- /dev/null
+++ b/src/isa-l/include/aarch64_multibinary.h
@@ -0,0 +1,311 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef __AARCH64_MULTIBINARY_H__
+#define __AARCH64_MULTIBINARY_H__
+#ifndef __aarch64__
+#error "This file is for aarch64 only"
+#endif
+#include <asm/hwcap.h>
+#ifdef __ASSEMBLY__
+/**
+ * # mbin_interface : the wrapper layer for the isa-l API
+ *
+ * ## References:
+ * * https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=sysdeps/aarch64/dl-trampoline.S
+ * * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+ * * https://static.docs.arm.com/ihi0057/b/IHI0057B_aadwarf64.pdf?_ga=2.80574487.1870739014.1564969896-1634778941.1548729310
+ *
+ * ## Usage:
+ * 	1. Define a dispatcher function.
+ * 	2. Its name must be \name\()_dispatcher.
+ * 	3. Its prototype should be *"void * \name\()_dispatcher"*.
+ * 	4. The dispatcher should return the appropriate function pointer, revision and description string.
+ **/
+.macro mbin_interface name:req
+ .extern \name\()_dispatcher
+ .section .data
+ .balign 8
+ .global \name\()_dispatcher_info
+ .type \name\()_dispatcher_info,%object
+
+ \name\()_dispatcher_info:
+ .quad \name\()_mbinit //func_entry
+
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+
+ .balign 8
+ .text
+ \name\()_mbinit:
+	//save fp and lr, then allocate the stack frame
+ .cfi_startproc
+ stp x29, x30, [sp, -224]!
+
+	//emit CFI (Call Frame Information) directives so GDB backtrace commands work correctly
+ .cfi_def_cfa_offset 224
+ .cfi_offset 29, -224
+ .cfi_offset 30, -216
+
+ //save parameter/result/indirect result registers
+ stp x8, x9, [sp, 16]
+ .cfi_offset 8, -208
+ .cfi_offset 9, -200
+ stp x0, x1, [sp, 32]
+ .cfi_offset 0, -192
+ .cfi_offset 1, -184
+ stp x2, x3, [sp, 48]
+ .cfi_offset 2, -176
+ .cfi_offset 3, -168
+ stp x4, x5, [sp, 64]
+ .cfi_offset 4, -160
+ .cfi_offset 5, -152
+ stp x6, x7, [sp, 80]
+ .cfi_offset 6, -144
+ .cfi_offset 7, -136
+ stp q0, q1, [sp, 96]
+ .cfi_offset 64, -128
+ .cfi_offset 65, -112
+ stp q2, q3, [sp, 128]
+ .cfi_offset 66, -96
+ .cfi_offset 67, -80
+ stp q4, q5, [sp, 160]
+ .cfi_offset 68, -64
+ .cfi_offset 69, -48
+ stp q6, q7, [sp, 192]
+ .cfi_offset 70, -32
+ .cfi_offset 71, -16
+
+	/**
+	 * The dispatcher functions have the following prototype:
+	 * 	void * function_dispatcher(void)
+	 * Per the AAPCS, the returned pointer comes back in x0.
+	 */
+
+
+ bl \name\()_dispatcher
+ //restore temp/indirect result registers
+ ldp x8, x9, [sp, 16]
+ .cfi_restore 8
+ .cfi_restore 9
+
+ // save function entry
+ str x0, [x9]
+
+ //restore parameter/result registers
+ ldp x0, x1, [sp, 32]
+ .cfi_restore 0
+ .cfi_restore 1
+ ldp x2, x3, [sp, 48]
+ .cfi_restore 2
+ .cfi_restore 3
+ ldp x4, x5, [sp, 64]
+ .cfi_restore 4
+ .cfi_restore 5
+ ldp x6, x7, [sp, 80]
+ .cfi_restore 6
+ .cfi_restore 7
+ ldp q0, q1, [sp, 96]
+ .cfi_restore 64
+ .cfi_restore 65
+ ldp q2, q3, [sp, 128]
+ .cfi_restore 66
+ .cfi_restore 67
+ ldp q4, q5, [sp, 160]
+ .cfi_restore 68
+ .cfi_restore 69
+ ldp q6, q7, [sp, 192]
+ .cfi_restore 70
+ .cfi_restore 71
+	//restore fp and lr, deallocate the stack frame
+ ldp x29, x30, [sp], 224
+ //restore cfi setting
+ .cfi_restore 30
+ .cfi_restore 29
+ .cfi_def_cfa_offset 0
+ .cfi_endproc
+
+ .global \name
+ .type \name,%function
+ .align 2
+ \name\():
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+ ldr x10,[x9]
+ br x10
+ .size \name,. - \name
+
+.endm
+
+/**
+ * mbin_interface_base is used for interfaces which have only a
+ * generic (noarch) implementation
+ */
+.macro mbin_interface_base name:req, base:req
+ .extern \base
+ .section .data
+ .balign 8
+ .global \name\()_dispatcher_info
+ .type \name\()_dispatcher_info,%object
+
+ \name\()_dispatcher_info:
+ .quad \base //func_entry
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+
+ .balign 8
+ .text
+ .global \name
+ .type \name,%function
+ .align 2
+ \name\():
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+ ldr x10,[x9]
+ br x10
+ .size \name,. - \name
+
+.endm
+
+#else /* __ASSEMBLY__ */
+#include <sys/auxv.h>
+
+
+
+#define DEFINE_INTERFACE_DISPATCHER(name) \
+ void * name##_dispatcher(void)
+
+#define PROVIDER_BASIC(name) \
+ PROVIDER_INFO(name##_base)
+
+#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x
+#define DO_PRAGMA(x) _Pragma (#x)
+#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x)
+#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push)
+#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop)
+
+
+#define PROVIDER_INFO(_func_entry) \
+ ({ DIGNOSTIC_PUSH() \
+ DIGNOSTIC_IGNORE(-Wnested-externs) \
+ extern void _func_entry(void); \
+ DIGNOSTIC_POP() \
+ _func_entry; \
+ })
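
    As an illustration of how these pieces fit together (the interface name foo,
    its foo_base fallback and the foo_pmull variant are hypothetical), an assembly
    file invokes the mbin_interface macro above while a C file defines the matching
    dispatcher with the helpers just defined:

        /*
         * foo_multibinary.S (hypothetical):
         *     #include "aarch64_multibinary.h"
         *     mbin_interface foo
         *
         * foo_dispatcher.c (hypothetical):
         */
        DEFINE_INTERFACE_DISPATCHER(foo)
        {
                unsigned long auxval = getauxval(AT_HWCAP);

                if (auxval & HWCAP_PMULL)                /* pick an optimized variant */
                        return PROVIDER_INFO(foo_pmull); /* hypothetical implementation */

                return PROVIDER_BASIC(foo);              /* falls back to foo_base */
        }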
+
+/**
+ * Micro-architecture definitions
+ * Reference: https://developer.arm.com/docs/ddi0595/f/aarch64-system-registers/midr_el1
+ */
+
+#define CPU_IMPLEMENTER_RESERVE 0x00
+#define CPU_IMPLEMENTER_ARM 0x41
+
+
+#define CPU_PART_CORTEX_A57 0xD07
+#define CPU_PART_CORTEX_A72 0xD08
+#define CPU_PART_NEOVERSE_N1 0xD0C
+
+#define MICRO_ARCH_ID(imp,part) \
+ (((CPU_IMPLEMENTER_##imp&0xff)<<24)|((CPU_PART_##part&0xfff)<<4))
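
    For example, MICRO_ARCH_ID(ARM, NEOVERSE_N1) expands to
    (0x41 << 24) | (0xD0C << 4) = 0x4100D0C0, which is the same masked form that
    get_micro_arch_id() below returns (implementer in bits 31:24, part number in
    bits 15:4), so the two can be compared directly.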
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1<<11)
+#endif
+
+/**
+ * @brief get_micro_arch_id
+ *
+ * Read the micro-architecture register (MIDR_EL1) if possible. This function
+ * provides micro-architecture information and makes micro-architecture
+ * specific optimization possible.
+ *
+ * Reading system registers (MRS) is forbidden in userspace and would
+ * normally raise an illegal instruction error. The kernel provides a
+ * solution for this, gated by the HWCAP_CPUID flag; reference (1) below
+ * describes how to use it. An "illegal instruction" handler in kernel
+ * space executes the MRS and returns the correct value to userspace.
+ *
+ * To avoid excessive kernel traps, this function MUST only be called from a
+ * dispatcher, and the required HWCAP bits must already have been checked so
+ * that no illegal instruction errors occur. HWCAP_CPUID should be available
+ * to get the best performance.
+ *
+ * NOTICE:
+ *  - HWCAP_CPUID should be available; otherwise the reserved value is returned
+ *  - It MUST be called inside a dispatcher
+ *  - It MUST meet the HWCAP requirements
+ *
+ * Example:
+ * DEFINE_INTERFACE_DISPATCHER(crc32_iscsi)
+ * {
+ * unsigned long auxval = getauxval(AT_HWCAP);
+ *	 // This HWCAP check is REQUIRED.
+ * if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) {
+ * switch (get_micro_arch_id()) {
+ * case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a57);
+ * case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a72);
+ * case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_n1);
+ *			default:
+ * return PROVIDER_INFO(crc32_pmull_crc_for_others);
+ * }
+ * }
+ * return PROVIDER_BASIC(crc32_iscsi);
+ * }
+ * KNOWN ISSUE:
+ *   On a heterogeneous system (big.LITTLE), this will work, but the
+ *   performance might not be the best expected.
+ *
+ *   If this function is called on a big core, it will return the function
+ *   optimized for the big core.
+ *
+ *   If execution is then scheduled onto a little core, it will still work,
+ *   but the selected function won't be optimized for the little core, so
+ *   the performance won't be as expected.
+ *
+ * References:
+ * - [CPU Feature detection](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/cpu-feature-registers.rst?h=v5.5)
+ *
+ */
+static inline uint32_t get_micro_arch_id(void)
+{
+ uint32_t id=CPU_IMPLEMENTER_RESERVE;
+ if ((getauxval(AT_HWCAP) & HWCAP_CPUID)) {
+		/* This read traps into kernel space */
+ asm("mrs %0, MIDR_EL1 " : "=r" (id));
+ }
+ return id&0xff00fff0;
+}
+
+
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/src/isa-l/include/crc.h b/src/isa-l/include/crc.h
new file mode 100644
index 000000000..071496083
--- /dev/null
+++ b/src/isa-l/include/crc.h
@@ -0,0 +1,212 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file crc.h
+ * @brief CRC functions.
+ */
+
+
+#ifndef _CRC_H_
+#define _CRC_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Multi-binary functions */
+
+/**
+ * @brief Generate CRC from the T10 standard, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @returns 16 bit CRC
+ */
+uint16_t crc16_t10dif(
+ uint16_t init_crc, //!< initial CRC value, 16 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+
+/**
+ * @brief Generate CRC and copy T10 standard, runs appropriate version.
+ *
+ * Stitched CRC + copy function.
+ *
+ * @returns 16 bit CRC
+ */
+uint16_t crc16_t10dif_copy(
+ uint16_t init_crc, //!< initial CRC value, 16 bits
+ uint8_t *dst, //!< buffer destination for copy
+ uint8_t *src, //!< buffer source to crc + copy
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
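
    A minimal sketch of the stitched copy + CRC call (buffer names are
    illustrative; 0 is the conventional T10-DIF starting seed):

        #include "crc.h"

        /* copy 'src' into 'dst' and return the T10-DIF CRC of the data in one pass */
        static uint16_t copy_with_crc16(uint8_t *dst, uint8_t *src, uint64_t len)
        {
                return crc16_t10dif_copy(0, dst, src, len);
        }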
+
+
+/**
+ * @brief Generate CRC from the IEEE standard, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * Note: The CRC32 IEEE standard is widely used in HDLC, Ethernet, gzip and
+ * many others. Its polynomial is 0x04C11DB7 in normal form and 0xEDB88320
+ * in reflected (reversed) form. In ISA-L, crc32_ieee implements the normal
+ * CRC32 IEEE variant and crc32_gzip_refl implements the reflected CRC32
+ * IEEE variant. The two variants are not compatible with each other.
+ * Users replacing an unoptimized CRC32 IEEE with ISA-L's functions should
+ * be careful about this. Since many applications use the reflected CRC32
+ * IEEE variant, please check whether crc32_gzip_refl, rather than
+ * crc32_ieee, is the right one for you.
+ *
+ * @returns 32 bit CRC
+ */
+
+uint32_t crc32_ieee(
+ uint32_t init_crc, //!< initial CRC value, 32 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate the customized CRC
+ * based on RFC 1952 CRC (http://www.ietf.org/rfc/rfc1952.txt) standard,
+ * runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * Note: The CRC32 IEEE standard is widely used in HDLC, Ethernet, gzip and
+ * many others. Its polynomial is 0x04C11DB7 in normal form and 0xEDB88320
+ * in reflected (reversed) form. In ISA-L, crc32_ieee implements the normal
+ * CRC32 IEEE variant and crc32_gzip_refl implements the reflected CRC32
+ * IEEE variant. The two variants are not compatible with each other.
+ * Users replacing an unoptimized CRC32 IEEE with ISA-L's functions should
+ * be careful about this. Since many applications use the reflected CRC32
+ * IEEE variant, please check whether crc32_gzip_refl, rather than
+ * crc32_ieee, is the right one for you.
+ *
+ * @returns 32 bit CRC
+ */
+uint32_t crc32_gzip_refl(
+ uint32_t init_crc, //!< initial CRC value, 32 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
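
    A minimal sketch of computing the reflected (gzip/zlib-style) CRC32 over a
    buffer; a seed of 0 is assumed as the usual starting value, and the previous
    result can be passed back as init_crc to checksum data in chunks:

        #include <stdio.h>
        #include <string.h>
        #include "crc.h"

        int main(void)
        {
                unsigned char buf[1024];

                memset(buf, 0xab, sizeof(buf));
                printf("crc32 = 0x%08x\n",
                       (unsigned)crc32_gzip_refl(0, buf, sizeof(buf)));
                return 0;
        }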
+
+
+/**
+ * @brief ISCSI CRC function, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @returns 32 bit CRC
+ */
+unsigned int crc32_iscsi(
+ unsigned char *buffer, //!< buffer to calculate CRC on
+ int len, //!< buffer length in bytes
+ unsigned int init_crc //!< initial CRC value
+ );
+
+
+/* Base functions */
+
+/**
+ * @brief ISCSI CRC function, baseline version
+ * @returns 32 bit CRC
+ */
+unsigned int crc32_iscsi_base(
+ unsigned char *buffer, //!< buffer to calculate CRC on
+ int len, //!< buffer length in bytes
+ unsigned int crc_init //!< initial CRC value
+ );
+
+
+/**
+ * @brief Generate CRC from the T10 standard, runs baseline version
+ * @returns 16 bit CRC
+ */
+uint16_t crc16_t10dif_base(
+ uint16_t seed, //!< initial CRC value, 16 bits
+ uint8_t *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+
+/**
+ * @brief Generate CRC and copy T10 standard, runs baseline version.
+ * @returns 16 bit CRC
+ */
+uint16_t crc16_t10dif_copy_base(
+ uint16_t init_crc, //!< initial CRC value, 16 bits
+ uint8_t *dst, //!< buffer destination for copy
+ uint8_t *src, //!< buffer source to crc + copy
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+
+/**
+ * @brief Generate CRC from the IEEE standard, runs baseline version
+ * @returns 32 bit CRC
+ */
+uint32_t crc32_ieee_base(
+ uint32_t seed, //!< initial CRC value, 32 bits
+ uint8_t *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate the customized CRC
+ * based on RFC 1952 CRC (http://www.ietf.org/rfc/rfc1952.txt) standard,
+ * runs baseline version
+ * @returns 32 bit CRC
+ */
+uint32_t crc32_gzip_refl_base(
+ uint32_t seed, //!< initial CRC value, 32 bits
+ uint8_t *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _CRC_H_
diff --git a/src/isa-l/include/crc64.h b/src/isa-l/include/crc64.h
new file mode 100644
index 000000000..d0e02748c
--- /dev/null
+++ b/src/isa-l/include/crc64.h
@@ -0,0 +1,277 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file crc64.h
+ * @brief CRC64 functions.
+ */
+
+
+#ifndef _CRC64_H_
+#define _CRC64_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Multi-binary functions */
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in reflected format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_refl(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in normal format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_norm(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ISO standard in reflected format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_refl(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ISO standard in normal format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_norm(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in reflected format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_refl(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in normal format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_norm(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
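
    A short sketch using the reflected ECMA-182 variant; as with the CRC32
    functions above, 0 is assumed here as the starting seed, and the returned
    value can be fed back in as init_crc for incremental use:

        #include "crc64.h"

        /* checksum one block; pass the previous result as 'seed' to continue */
        static uint64_t crc64_block(uint64_t seed, const unsigned char *buf, uint64_t len)
        {
                return crc64_ecma_refl(seed, buf, len);
        }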
+
+/* Arch specific versions */
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in reflected format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_ecma_refl_by8(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in normal format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_ecma_norm_by8(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in reflected format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_refl_base(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in normal format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_norm_base(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ISO standard in reflected format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_iso_refl_by8(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ISO standard in normal format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_iso_norm_by8(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ISO standard in reflected format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_refl_base(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from ISO standard in normal format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_norm_base(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in reflected format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_jones_refl_by8(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in normal format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_jones_norm_by8(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in reflected format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_refl_base(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in normal format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_norm_base(
+ uint64_t init_crc, //!< initial CRC value, 64 bits
+ const unsigned char *buf, //!< buffer to calculate CRC on
+ uint64_t len //!< buffer length in bytes (64-bit data)
+ );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _CRC64_H_
diff --git a/src/isa-l/include/erasure_code.h b/src/isa-l/include/erasure_code.h
new file mode 100644
index 000000000..2f9a257e5
--- /dev/null
+++ b/src/isa-l/include/erasure_code.h
@@ -0,0 +1,947 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _ERASURE_CODE_H_
+#define _ERASURE_CODE_H_
+
+/**
+ * @file erasure_code.h
+ * @brief Interface to functions supporting erasure code encode and decode.
+ *
+ * This file defines the interface to optimized functions used in erasure
+ * codes. Encode and decode of erasures in GF(2^8) are made by calculating the
+ * dot product of the symbols (bytes in GF(2^8)) across a set of buffers and a
+ * set of coefficients. Values for the coefficients are determined by the type
+ * of erasure code. Using a general dot product means that any sequence of
+ * coefficients may be used including erasure codes based on random
+ * coefficients.
+ * Multiple versions of dot product are supplied to calculate 1-6 output
+ * vectors in one pass.
+ * Base GF multiply and divide functions can be sped up by defining
+ * GF_LARGE_TABLES at the expense of memory size.
+ *
+ */
+
+#include "gf_vect_mul.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Initialize tables for fast Erasure Code encode and decode.
+ *
+ * Generates the expanded tables needed for fast encode or decode for erasure
+ * codes on blocks of data. 32 bytes are generated for each input coefficient.
+ *
+ * @param k The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param rows The number of output vectors to concurrently encode/decode.
+ * @param a Pointer to sets of arrays of input coefficients used to encode
+ * or decode data.
+ * @param gftbls Pointer to start of space for concatenated output tables
+ * generated from input coefficients. Must be of size 32*k*rows.
+ * @returns none
+ */
+
+void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
+ *
+ * Given a list of source data blocks, generate one or multiple blocks of
+ * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
+ * suitable set of coefficients, this function will perform the fast generation
+ * or decoding of Reed-Solomon type erasure codes.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param len Length of each block of data (vector) of source or dest data.
+ * @param k The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param rows The number of output vectors to concurrently encode/decode.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*k*rows
+ * @param data Array of pointers to source input buffers.
+ * @param coding Array of pointers to coded output buffers.
+ * @returns none
+ */
+
+void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
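
    A hedged sketch of the basic encode flow built from these two calls; the
    sizes and the caller-filled coefficient matrix a (one row of K coefficients
    per parity row) are illustrative:

        #include "erasure_code.h"

        enum { LEN = 4096, K = 4, ROWS = 2 };   /* illustrative sizes */

        /* expand the coefficients once, then generate ROWS parity blocks
         * from K data blocks */
        static void encode_example(unsigned char *data[K], unsigned char *coding[ROWS],
                                   unsigned char a[K * ROWS])
        {
                unsigned char gftbls[32 * K * ROWS];

                ec_init_tables(K, ROWS, a, gftbls);
                ec_encode_data(LEN, K, ROWS, gftbls, data, coding);
        }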
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ *
+ * Baseline version of ec_encode_data() with same parameters.
+ */
+void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
+ unsigned char **dest);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
+ *
+ * Given one source data block, update one or multiple blocks of encoded data as
+ * specified by a matrix of GF(2^8) coefficients. When given a suitable set of
+ * coefficients, this function will perform the fast generation or decoding of
+ * Reed-Solomon type erasure codes from one input source at a time.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param len Length of each block of data (vector) of source or dest data.
+ * @param k The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param rows The number of output vectors to concurrently encode/decode.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param g_tbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*k*rows
+ * @param data Pointer to single input source used to update output parity.
+ * @param coding Array of pointers to coded output buffers.
+ * @returns none
+ */
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
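
    With the same illustrative sizes (LEN, K, ROWS as in the sketch above), the
    incremental variant folds one source block at a time into the parity buffers.
    A sketch, assuming the coding buffers were zeroed before the first update and
    gftbls was prepared with ec_init_tables():

        /* accumulate data block number vec_i (0 <= vec_i < K) into the parity buffers */
        static void update_example(int vec_i, unsigned char *block,
                                   unsigned char **coding, unsigned char *gftbls)
        {
                ec_encode_data_update(LEN, K, ROWS, vec_i, gftbls, block, coding);
        }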
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Baseline version of ec_encode_data_update().
+ */
+
+void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
+ unsigned char *data, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product, runs baseline version.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ * on the array of input coefficients. Only elements 32*CONST*j + 1
+ * of this array are used, where j = (0, 1, 2...) and CONST is the
+ * number of elements in the array of input coefficients. The
+ * elements used correspond to the original input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+
+void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product, runs appropriate version.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ * on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, runs appropriate version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param len Length of each vector in bytes. Must be >= 64.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, baseline version.
+ *
+ * Baseline version of gf_vect_mad() with same parameters.
+ */
+
+void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
+ unsigned char *dest);
+
+// x86 only
+#if defined(__i386__) || defined(__x86_64__)
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires SSE4.1
+ */
+void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX
+ */
+void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX2
+ */
+void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires SSE4.1
+ */
+
+void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX
+ */
+
+void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX2
+ */
+
+void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief GF(2^8) vector dot product.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ * on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires AVX
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ * on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires AVX2
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ * on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires SSE4.1
+ */
+
+void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX
+ */
+
+void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX2
+ */
+
+void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of the input source with expanded
+ * constants and adds to the destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
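+
+/*
+ * An update-style encode sketch for the case described above, where sources
+ * arrive one at a time: each call folds source i into both parity buffers.
+ * gftbls is assumed to hold the expanded coding coefficients (see
+ * ec_init_tables()); src, parity and the sizes are placeholders.
+ *
+ * @code
+ *    int len = 4096, vec = 10;               // ten sources, two parities
+ *    unsigned char *src[10], *parity[2];     // parity buffers zeroed before the first update
+ *    unsigned char *gftbls;                  // expanded coefficient tables, sized as documented above
+ *    for (int i = 0; i < vec; i++)
+ *        gf_2vect_mad_sse(len, vec, i, gftbls, src[i], parity);
+ * @endcode
+ */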
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
+ * @requires AVX
+ */
+void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of the input source with expanded
+ * constants and adds to the destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
+ * @requires AVX
+ */
+void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of the input source with expanded
+ * constants and adds to the destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
+ * @requires AVX
+ */
+void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+#endif
+
+/**********************************************************************
+ * The remaining are lib support functions used in GF(2^8) operations.
+ */
+
+/**
+ * @brief Single element GF(2^8) multiply.
+ *
+ * @param a Multiplicand a
+ * @param b Multiplicand b
+ * @returns Product of a and b in GF(2^8)
+ */
+
+unsigned char gf_mul(unsigned char a, unsigned char b);
+
+/**
+ * @brief Single element GF(2^8) inverse.
+ *
+ * @param a Input element
+ * @returns Field element b such that a x b = {1}
+ */
+
+unsigned char gf_inv(unsigned char a);
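+
+/*
+ * A small sanity-check sketch: multiplying a non-zero element by its inverse
+ * yields the multiplicative identity {1}.
+ *
+ * @code
+ *    unsigned char a = 0x53;
+ *    unsigned char b = gf_inv(a);
+ *    // gf_mul(a, b) == 1 for any non-zero a
+ * @endcode
+ */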
+
+/**
+ * @brief Generate a matrix of coefficients to be used for encoding.
+ *
+ * Vandermonde matrix example of encoding coefficients where high portion of
+ * matrix is identity matrix I and lower portion is constructed as 2^{i*(j-k+1)}
+ * i:{0,k-1} j:{k,m-1}. Commonly used method for choosing coefficients in
+ * erasure encoding, but it does not guarantee that every sub-matrix is
+ * invertible. For large pairs of m and k it is possible to find cases where
+ * the decode matrix chosen from sources and parity is not invertible. Users
+ * may want to adjust for such pairs of m and k. If m and k satisfy one of the
+ * following inequalities, no adjustment is required:
+ *
+ * - k <= 3
+ * - k = 4, m <= 25
+ * - k = 5, m <= 10
+ * - k <= 21, m-k = 4
+ * - m - k <= 3.
+ *
+ * @param a [m x k] array to hold coefficients
+ * @param m number of rows in matrix corresponding to srcs + parity.
+ * @param k number of columns in matrix corresponding to srcs.
+ * @returns none
+ */
+
+void gf_gen_rs_matrix(unsigned char *a, int m, int k);
+
+/**
+ * @brief Generate a Cauchy matrix of coefficients to be used for encoding.
+ *
+ * Cauchy matrix example of encoding coefficients where high portion of matrix
+ * is identity matrix I and lower portion is constructed as 1/(i + j) | i != j,
+ * i:{0,k-1} j:{k,m-1}. Any sub-matrix of a Cauchy matrix should be invertible.
+ *
+ * @param a [m x k] array to hold coefficients
+ * @param m number of rows in matrix corresponding to srcs + parity.
+ * @param k number of columns in matrix corresponding to srcs.
+ * @returns none
+ */
+
+void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
+
+/**
+ * @brief Invert a matrix in GF(2^8)
+ *
+ * Attempts to construct an n x n inverse of the input matrix. Returns non-zero
+ * if the input matrix is singular. The input matrix is always destroyed in the process.
+ *
+ * @param in input matrix, destroyed by invert process
+ * @param out output matrix such that [in] x [out] = [I] - identity matrix
+ * @param n size of matrix [nxn]
+ * @returns 0 on success, non-zero on failure (singular input matrix)
+ */
+
+int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
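+
+/*
+ * A decode-preparation sketch, assuming the caller has gathered the k rows of
+ * the encode matrix that correspond to the k surviving fragments into sub.
+ * Inverting that sub-matrix yields the coefficients used to reconstruct lost
+ * data; row selection and error handling are omitted.
+ *
+ * @code
+ *    int m = 14, k = 10;
+ *    unsigned char encode_matrix[14 * 10];
+ *    unsigned char sub[10 * 10], inv[10 * 10];
+ *    gf_gen_cauchy1_matrix(encode_matrix, m, k);
+ *    // ... copy the k surviving rows of encode_matrix into sub ...
+ *    if (gf_invert_matrix(sub, inv, k) != 0)
+ *        ;   // singular sub-matrix; sub has been destroyed either way
+ * @endcode
+ */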
+
+
+/*************************************************************/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ERASURE_CODE_H_
diff --git a/src/isa-l/include/gf_vect_mul.h b/src/isa-l/include/gf_vect_mul.h
new file mode 100644
index 000000000..70a0ab2ed
--- /dev/null
+++ b/src/isa-l/include/gf_vect_mul.h
@@ -0,0 +1,152 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _GF_VECT_MUL_H
+#define _GF_VECT_MUL_H
+
+/**
+ * @file gf_vect_mul.h
+ * @brief Interface to functions for vector (block) multiplication in GF(2^8).
+ *
+ * This file defines the interface to routines used in fast RAID rebuild and
+ * erasure codes.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// x86 only
+#if defined(__i386__) || defined(__x86_64__)
+
+ /**
+ * @brief GF(2^8) vector multiply by constant.
+ *
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
+ * and src must be aligned to 32B.
+ * @requires SSE4.1
+ *
+ * @param len Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src Pointer to src data array. Must be aligned to 32B.
+ * @param dest Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ */
+
+int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest);
+
+
+ /**
+ * @brief GF(2^8) vector multiply by constant.
+ *
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
+ * and src must be aligned to 32B.
+ * @requires AVX
+ *
+ * @param len Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src Pointer to src data array. Must be aligned to 32B.
+ * @param dest Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ */
+
+int gf_vect_mul_avx(int len, unsigned char *gftbl, void *src, void *dest);
+
+#endif
+
+/**
+ * @brief GF(2^8) vector multiply by constant, runs appropriate version.
+ *
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
+ * Len and src must be aligned to 32B.
+ *
+ * This function determines what instruction sets are enabled
+ * and selects the appropriate version at runtime.
+ *
+ * @param len Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src Pointer to src data array. Must be aligned to 32B.
+ * @param dest Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ */
+
+int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
+
+
+/**
+ * @brief Initialize 32-byte constant array for GF(2^8) vector multiply
+ *
+ * Calculates array {C{00}, C{01}, C{02}, ... , C{0f} }, {C{00}, C{10},
+ * C{20}, ... , C{f0} } as required by other fast vector multiply
+ * functions.
+ * @param c Constant input.
+ * @param gftbl Table output.
+ */
+
+void gf_vect_mul_init(unsigned char c, unsigned char* gftbl);
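+
+/*
+ * A minimal multiply-by-constant sketch: expand the constant once and reuse
+ * the table for each buffer. len, src and dest are placeholders and are
+ * assumed to satisfy the length and alignment rules stated above.
+ *
+ * @code
+ *    unsigned char gftbl[32];
+ *    gf_vect_mul_init(0x1d, gftbl);                 // expand constant C = 0x1d
+ *    int ret = gf_vect_mul(len, gftbl, src, dest);  // dest = C * src
+ *    // ret == 0 on success
+ * @endcode
+ */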
+
+
+/**
+ * @brief GF(2^8) vector multiply by constant, runs baseline version.
+ *
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
+ * and src must be aligned to 32B.
+ *
+ * @param len Length of vector in bytes. Must be aligned to 32B.
+ * @param a Pointer to 32-byte array of pre-calculated constants based on C.
+ *           Only the second element is used.
+ * @param src Pointer to src data array. Must be aligned to 32B.
+ * @param dest Pointer to destination data array. Must be aligned to 32B.
+ */
+
+void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src,
+ unsigned char *dest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_GF_VECT_MUL_H
diff --git a/src/isa-l/include/igzip_lib.h b/src/isa-l/include/igzip_lib.h
new file mode 100644
index 000000000..57333748b
--- /dev/null
+++ b/src/isa-l/include/igzip_lib.h
@@ -0,0 +1,990 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _IGZIP_H
+#define _IGZIP_H
+
+/**
+ * @file igzip_lib.h
+ *
+ * @brief This file defines the igzip compression and decompression interface, a
+ * high performance deflate compression interface for storage applications.
+ *
+ * Deflate is a widely used compression standard that can be used standalone; it
+ * also forms the basis of the gzip and zlib compression formats. Igzip supports the
+ * following flush features:
+ *
+ * - No Flush: The default method where no special flush is performed.
+ *
+ * - Sync flush: whereby isal_deflate() finishes the current deflate block at
+ * the end of each input buffer. The deflate block is byte aligned by
+ * appending an empty stored block.
+ *
+ * - Full flush: whereby isal_deflate() finishes and aligns the deflate block as
+ * in sync flush but also ensures that subsequent block's history does not
+ * look back beyond this point and new blocks are fully independent.
+ *
+ * Igzip also supports compression levels from ISAL_DEF_MIN_LEVEL to
+ * ISAL_DEF_MAX_LEVEL.
+ *
+ * Igzip contains some behavior configurable at compile time. These
+ * configurable options are:
+ *
+ * - IGZIP_HIST_SIZE - Defines the window size. The default value is 32K (note K
+ * represents 1024), but 8K is also supported. Powers of 2 which are at most
+ * 32K may also work.
+ *
+ * - LONGER_HUFFTABLE - Defines whether to use a larger hufftables structure
+ *   which may increase performance with smaller IGZIP_HIST_SIZE values. By
+ *   default this option is not defined. Defining it caps IGZIP_HIST_SIZE at
+ *   8K if IGZIP_HIST_SIZE > 8K.
+ *
+ * As an example, to compile gzip with an 8K window size, in a terminal run
+ * @verbatim gmake D="-D IGZIP_HIST_SIZE=8*1024" @endverbatim on Linux and
+ * FreeBSD, or with @verbatim nmake -f Makefile.nmake D="-D
+ * IGZIP_HIST_SIZE=8*1024" @endverbatim on Windows.
+ *
+ */
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+/* Deflate Compression Standard Defines */
+/******************************************************************************/
+#define IGZIP_K 1024
+#define ISAL_DEF_MAX_HDR_SIZE 328
+#define ISAL_DEF_MAX_CODE_LEN 15
+#define ISAL_DEF_HIST_SIZE (32*IGZIP_K)
+#define ISAL_DEF_MAX_HIST_BITS 15
+#define ISAL_DEF_MAX_MATCH 258
+#define ISAL_DEF_MIN_MATCH 3
+
+#define ISAL_DEF_LIT_SYMBOLS 257
+#define ISAL_DEF_LEN_SYMBOLS 29
+#define ISAL_DEF_DIST_SYMBOLS 30
+#define ISAL_DEF_LIT_LEN_SYMBOLS (ISAL_DEF_LIT_SYMBOLS + ISAL_DEF_LEN_SYMBOLS)
+
+/* Max repeat length, rounded up to 32 byte boundary */
+#define ISAL_LOOK_AHEAD ((ISAL_DEF_MAX_MATCH + 31) & ~31)
+
+/******************************************************************************/
+/* Deflate Implementation Specific Defines */
+/******************************************************************************/
+/* Note IGZIP_HIST_SIZE must be a power of two */
+#ifndef IGZIP_HIST_SIZE
+#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE
+#endif
+
+#if (IGZIP_HIST_SIZE > ISAL_DEF_HIST_SIZE)
+#undef IGZIP_HIST_SIZE
+#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE
+#endif
+
+#ifdef LONGER_HUFFTABLE
+#if (IGZIP_HIST_SIZE > 8 * IGZIP_K)
+#undef IGZIP_HIST_SIZE
+#define IGZIP_HIST_SIZE (8 * IGZIP_K)
+#endif
+#endif
+
+#define ISAL_LIMIT_HASH_UPDATE
+
+#define IGZIP_HASH8K_HASH_SIZE (8 * IGZIP_K)
+#define IGZIP_HASH_HIST_SIZE IGZIP_HIST_SIZE
+#define IGZIP_HASH_MAP_HASH_SIZE IGZIP_HIST_SIZE
+
+#define IGZIP_LVL0_HASH_SIZE (8 * IGZIP_K)
+#define IGZIP_LVL1_HASH_SIZE IGZIP_HASH8K_HASH_SIZE
+#define IGZIP_LVL2_HASH_SIZE IGZIP_HASH_HIST_SIZE
+#define IGZIP_LVL3_HASH_SIZE IGZIP_HASH_MAP_HASH_SIZE
+
+#ifdef LONGER_HUFFTABLE
+enum {IGZIP_DIST_TABLE_SIZE = 8*1024};
+
+/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */
+enum { IGZIP_DECODE_OFFSET = 26 };
+#else
+enum {IGZIP_DIST_TABLE_SIZE = 2};
+/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */
+enum { IGZIP_DECODE_OFFSET = 0 };
+#endif
+enum {IGZIP_LEN_TABLE_SIZE = 256};
+enum {IGZIP_LIT_TABLE_SIZE = ISAL_DEF_LIT_SYMBOLS};
+
+#define IGZIP_HUFFTABLE_CUSTOM 0
+#define IGZIP_HUFFTABLE_DEFAULT 1
+#define IGZIP_HUFFTABLE_STATIC 2
+
+/* Flush Flags */
+#define NO_FLUSH 0 /* Default */
+#define SYNC_FLUSH 1
+#define FULL_FLUSH 2
+#define FINISH_FLUSH 0 /* Deprecated */
+
+/* Gzip Flags */
+#define IGZIP_DEFLATE 0 /* Default */
+#define IGZIP_GZIP 1
+#define IGZIP_GZIP_NO_HDR 2
+#define IGZIP_ZLIB 3
+#define IGZIP_ZLIB_NO_HDR 4
+
+/* Compression Return values */
+#define COMP_OK 0
+#define INVALID_FLUSH -7
+#define INVALID_PARAM -8
+#define STATELESS_OVERFLOW -1
+#define ISAL_INVALID_OPERATION -9
+#define ISAL_INVALID_STATE -3
+#define ISAL_INVALID_LEVEL -4 /* Invalid Compression level set */
+#define ISAL_INVALID_LEVEL_BUF -5 /* Invalid buffer specified for the compression level */
+
+/**
+ * @enum isal_zstate_state
+ * @brief Compression state. Note that ZSTATE_TRL only applies to GZIP compression.
+ */
+
+
+/* When the state is set to ZSTATE_NEW_HDR or TMP_ZSTATE_NEW_HEADER, the
+ * hufftable being used for compression may be swapped
+ */
+enum isal_zstate_state {
+ ZSTATE_NEW_HDR, //!< Header to be written
+ ZSTATE_HDR, //!< Header state
+ ZSTATE_CREATE_HDR, //!< Header to be created
+ ZSTATE_BODY, //!< Body state
+ ZSTATE_FLUSH_READ_BUFFER, //!< Flush buffer
+ ZSTATE_FLUSH_ICF_BUFFER,
+	ZSTATE_TYPE0_HDR, //!< Type0 block header to be written
+ ZSTATE_TYPE0_BODY, //!< Type0 block body to be written
+ ZSTATE_SYNC_FLUSH, //!< Write sync flush block
+ ZSTATE_FLUSH_WRITE_BUFFER, //!< Flush bitbuf
+ ZSTATE_TRL, //!< Trailer state
+ ZSTATE_END, //!< End state
+ ZSTATE_TMP_NEW_HDR, //!< Temporary Header to be written
+ ZSTATE_TMP_HDR, //!< Temporary Header state
+ ZSTATE_TMP_CREATE_HDR, //!< Temporary Header to be created state
+ ZSTATE_TMP_BODY, //!< Temporary Body state
+ ZSTATE_TMP_FLUSH_READ_BUFFER, //!< Flush buffer
+ ZSTATE_TMP_FLUSH_ICF_BUFFER,
+	ZSTATE_TMP_TYPE0_HDR, //!< Temporary Type0 block header to be written
+ ZSTATE_TMP_TYPE0_BODY, //!< Temporary Type0 block body to be written
+ ZSTATE_TMP_SYNC_FLUSH, //!< Write sync flush block
+ ZSTATE_TMP_FLUSH_WRITE_BUFFER, //!< Flush bitbuf
+ ZSTATE_TMP_TRL, //!< Temporary Trailer state
+ ZSTATE_TMP_END //!< Temporary End state
+};
+
+/* Offset used to switch between TMP states and non-tmp states */
+#define ZSTATE_TMP_OFFSET ZSTATE_TMP_HDR - ZSTATE_HDR
+
+/******************************************************************************/
+/* Inflate Implementation Specific Defines */
+/******************************************************************************/
+#define ISAL_DECODE_LONG_BITS 12
+#define ISAL_DECODE_SHORT_BITS 10
+
+/* Current state of decompression */
+enum isal_block_state {
+ ISAL_BLOCK_NEW_HDR, /* Just starting a new block */
+ ISAL_BLOCK_HDR, /* In the middle of reading in a block header */
+ ISAL_BLOCK_TYPE0, /* Decoding a type 0 block */
+ ISAL_BLOCK_CODED, /* Decoding a huffman coded block */
+ ISAL_BLOCK_INPUT_DONE, /* Decompression of input is completed */
+ ISAL_BLOCK_FINISH, /* Decompression of input is completed and all data has been flushed to output */
+ ISAL_GZIP_EXTRA_LEN,
+ ISAL_GZIP_EXTRA,
+ ISAL_GZIP_NAME,
+ ISAL_GZIP_COMMENT,
+ ISAL_GZIP_HCRC,
+ ISAL_ZLIB_DICT,
+ ISAL_CHECKSUM_CHECK,
+};
+
+
+/* Inflate Flags */
+#define ISAL_DEFLATE 0 /* Default */
+#define ISAL_GZIP 1
+#define ISAL_GZIP_NO_HDR 2
+#define ISAL_ZLIB 3
+#define ISAL_ZLIB_NO_HDR 4
+#define ISAL_ZLIB_NO_HDR_VER 5
+#define ISAL_GZIP_NO_HDR_VER 6
+
+/* Inflate Return values */
+#define ISAL_DECOMP_OK 0 /* No errors encountered while decompressing */
+#define ISAL_END_INPUT 1 /* End of input reached */
+#define ISAL_OUT_OVERFLOW 2 /* End of output reached */
+#define ISAL_NAME_OVERFLOW 3 /* End of gzip name buffer reached */
+#define ISAL_COMMENT_OVERFLOW 4 /* End of gzip comment buffer reached */
+#define ISAL_EXTRA_OVERFLOW 5 /* End of extra buffer reached */
+#define ISAL_NEED_DICT 6 /* Stream needs a dictionary to continue */
+#define ISAL_INVALID_BLOCK -1 /* Invalid deflate block found */
+#define ISAL_INVALID_SYMBOL -2 /* Invalid deflate symbol found */
+#define ISAL_INVALID_LOOKBACK -3 /* Invalid lookback distance found */
+#define ISAL_INVALID_WRAPPER -4 /* Invalid gzip/zlib wrapper found */
+#define ISAL_UNSUPPORTED_METHOD -5 /* Gzip/zlib wrapper specifies unsupported compress method */
+#define ISAL_INCORRECT_CHECKSUM -6 /* Incorrect checksum found */
+
+/******************************************************************************/
+/* Compression structures */
+/******************************************************************************/
+/** @brief Holds histogram of deflate symbols*/
+struct isal_huff_histogram {
+ uint64_t lit_len_histogram[ISAL_DEF_LIT_LEN_SYMBOLS]; //!< Histogram of Literal/Len symbols seen
+ uint64_t dist_histogram[ISAL_DEF_DIST_SYMBOLS]; //!< Histogram of Distance Symbols seen
+ uint16_t hash_table[IGZIP_LVL0_HASH_SIZE]; //!< Tmp space used as a hash table
+};
+
+struct isal_mod_hist {
+ uint32_t d_hist[30];
+ uint32_t ll_hist[513];
+};
+
+#define ISAL_DEF_MIN_LEVEL 0
+#define ISAL_DEF_MAX_LEVEL 3
+
+/* Defines used to set level data sizes */
+/* These must be at least sizeof(struct level_buf) + sizeof(struct lvlX_buf) */
+#define ISAL_DEF_LVL0_REQ 0
+#define ISAL_DEF_LVL1_REQ (4 * IGZIP_K + 2 * IGZIP_LVL1_HASH_SIZE)
+#define ISAL_DEF_LVL1_TOKEN_SIZE 4
+#define ISAL_DEF_LVL2_REQ (4 * IGZIP_K + 2 * IGZIP_LVL2_HASH_SIZE)
+#define ISAL_DEF_LVL2_TOKEN_SIZE 4
+#define ISAL_DEF_LVL3_REQ 4 * IGZIP_K + 4 * 4 * IGZIP_K + 2 * IGZIP_LVL3_HASH_SIZE
+#define ISAL_DEF_LVL3_TOKEN_SIZE 4
+
+/* Data sizes for level specific data options */
+#define ISAL_DEF_LVL0_MIN ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_SMALL ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_MEDIUM ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_LARGE ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_EXTRA_LARGE ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_DEFAULT ISAL_DEF_LVL0_REQ
+
+#define ISAL_DEF_LVL1_MIN (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 1 * IGZIP_K)
+#define ISAL_DEF_LVL1_SMALL (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 16 * IGZIP_K)
+#define ISAL_DEF_LVL1_MEDIUM (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 32 * IGZIP_K)
+#define ISAL_DEF_LVL1_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 64 * IGZIP_K)
+#define ISAL_DEF_LVL1_EXTRA_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 128 * IGZIP_K)
+#define ISAL_DEF_LVL1_DEFAULT ISAL_DEF_LVL1_LARGE
+
+#define ISAL_DEF_LVL2_MIN (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 1 * IGZIP_K)
+#define ISAL_DEF_LVL2_SMALL (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 16 * IGZIP_K)
+#define ISAL_DEF_LVL2_MEDIUM (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 32 * IGZIP_K)
+#define ISAL_DEF_LVL2_LARGE (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 64 * IGZIP_K)
+#define ISAL_DEF_LVL2_EXTRA_LARGE (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 128 * IGZIP_K)
+#define ISAL_DEF_LVL2_DEFAULT ISAL_DEF_LVL2_LARGE
+
+#define ISAL_DEF_LVL3_MIN (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 1 * IGZIP_K)
+#define ISAL_DEF_LVL3_SMALL (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 16 * IGZIP_K)
+#define ISAL_DEF_LVL3_MEDIUM (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 32 * IGZIP_K)
+#define ISAL_DEF_LVL3_LARGE (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 64 * IGZIP_K)
+#define ISAL_DEF_LVL3_EXTRA_LARGE (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 128 * IGZIP_K)
+#define ISAL_DEF_LVL3_DEFAULT ISAL_DEF_LVL3_LARGE
+
+#define IGZIP_NO_HIST 0
+#define IGZIP_HIST 1
+#define IGZIP_DICT_HIST 2
+#define IGZIP_DICT_HASH_SET 3
+
+/** @brief Holds Bit Buffer information*/
+struct BitBuf2 {
+ uint64_t m_bits; //!< bits in the bit buffer
+ uint32_t m_bit_count; //!< number of valid bits in the bit buffer
+ uint8_t *m_out_buf; //!< current index of buffer to write to
+ uint8_t *m_out_end; //!< end of buffer to write to
+ uint8_t *m_out_start; //!< start of buffer to write to
+};
+
+struct isal_zlib_header {
+ uint32_t info; //!< base-2 logarithm of the LZ77 window size minus 8
+ uint32_t level; //!< Compression level (fastest, fast, default, maximum)
+ uint32_t dict_id; //!< Dictionary id
+ uint32_t dict_flag; //!< Whether to use a dictionary
+};
+
+struct isal_gzip_header {
+ uint32_t text; //!< Optional Text hint
+ uint32_t time; //!< Unix modification time in gzip header
+ uint32_t xflags; //!< xflags in gzip header
+ uint32_t os; //!< OS in gzip header
+ uint8_t *extra; //!< Extra field in gzip header
+ uint32_t extra_buf_len; //!< Length of extra buffer
+ uint32_t extra_len; //!< Actual length of gzip header extra field
+ char *name; //!< Name in gzip header
+ uint32_t name_buf_len; //!< Length of name buffer
+ char *comment; //!< Comments in gzip header
+ uint32_t comment_buf_len; //!< Length of comment buffer
+ uint32_t hcrc; //!< Header crc or header crc flag
+ uint32_t flags; //!< Internal data
+};
+
+/* Variable prefixes:
+ * b_ : Measured wrt the start of the buffer
+ * f_ : Measured wrt the start of the file (aka file_start)
+ */
+
+/** @brief Holds the internal state information for input and output compression streams*/
+struct isal_zstate {
+ uint32_t total_in_start; //!< Not used, may be replaced with something else
+ uint32_t block_next; //!< Start of current deflate block in the input
+ uint32_t block_end; //!< End of current deflate block in the input
+ uint32_t dist_mask; //!< Distance mask used.
+ uint32_t hash_mask;
+ enum isal_zstate_state state; //!< Current state in processing the data stream
+ struct BitBuf2 bitbuf; //!< Bit Buffer
+ uint32_t crc; //!< Current checksum without finalize step if any (adler)
+ uint8_t has_wrap_hdr; //!< keeps track of wrapper header
+ uint8_t has_eob_hdr; //!< keeps track of eob hdr (with BFINAL set)
+ uint8_t has_eob; //!< keeps track of eob on the last deflate block
+ uint8_t has_hist; //!< flag to track if there is match history
+ uint16_t has_level_buf_init; //!< flag to track if user supplied memory has been initialized.
+ uint32_t count; //!< used for partial header/trailer writes
+ uint8_t tmp_out_buff[16]; //!< temporary array
+ uint32_t tmp_out_start; //!< temporary variable
+ uint32_t tmp_out_end; //!< temporary variable
+ uint32_t b_bytes_valid; //!< number of valid bytes in buffer
+ uint32_t b_bytes_processed; //!< number of bytes processed in buffer
+ uint8_t buffer[2 * IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD]; //!< Internal buffer
+
+ /* Stream should be setup such that the head is cache aligned*/
+ uint16_t head[IGZIP_LVL0_HASH_SIZE]; //!< Hash array
+};
+
+/** @brief Holds the huffman tree used to huffman encode the input stream **/
+struct isal_hufftables {
+
+ uint8_t deflate_hdr[ISAL_DEF_MAX_HDR_SIZE]; //!< deflate huffman tree header
+ uint32_t deflate_hdr_count; //!< Number of whole bytes in deflate_huff_hdr
+ uint32_t deflate_hdr_extra_bits; //!< Number of bits in the partial byte in header
+ uint32_t dist_table[IGZIP_DIST_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code
+ uint32_t len_table[IGZIP_LEN_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code
+ uint16_t lit_table[IGZIP_LIT_TABLE_SIZE]; //!< literal code
+ uint8_t lit_table_sizes[IGZIP_LIT_TABLE_SIZE]; //!< literal code length
+ uint16_t dcodes[30 - IGZIP_DECODE_OFFSET]; //!< distance code
+ uint8_t dcodes_sizes[30 - IGZIP_DECODE_OFFSET]; //!< distance code length
+
+};
+
+/** @brief Holds stream information*/
+struct isal_zstream {
+ uint8_t *next_in; //!< Next input byte
+ uint32_t avail_in; //!< number of bytes available at next_in
+ uint32_t total_in; //!< total number of bytes read so far
+
+ uint8_t *next_out; //!< Next output byte
+ uint32_t avail_out; //!< number of bytes available at next_out
+ uint32_t total_out; //!< total number of bytes written so far
+
+ struct isal_hufftables *hufftables; //!< Huffman encoding used when compressing
+ uint32_t level; //!< Compression level to use
+ uint32_t level_buf_size; //!< Size of level_buf
+ uint8_t * level_buf; //!< User allocated buffer required for different compression levels
+ uint16_t end_of_stream; //!< non-zero if this is the last input buffer
+ uint16_t flush; //!< Flush type can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH
+ uint16_t gzip_flag; //!< Indicate if gzip compression is to be performed
+ uint16_t hist_bits; //!< Log base 2 of maximum lookback distance, 0 is use default
+ struct isal_zstate internal_state; //!< Internal state for this stream
+};
+
+/******************************************************************************/
+/* Inflate structures */
+/******************************************************************************/
+/*
+ * Inflate_huff_code data structures are used to store a Huffman code for fast
+ * lookup. It works by performing a lookup in small_code_lookup that hopefully
+ * yields the correct symbol. Otherwise a lookup into long_code_lookup is
+ * performed to find the correct symbol. The details of how this works follows:
+ *
+ * Let i be some index into small_code_lookup and let e be the associated
+ * element. Bit 15 in e is a flag. If bit 15 is not set, then index i contains
+ * a Huffman code for a symbol which has length at most DECODE_LOOKUP_SIZE. Bits
+ * 0 through 8 are the symbol associated with that code and bits 9 through 12 of
+ * e represent the number of bits in the code. If bit 15 is set, then i
+ * corresponds to the first DECODE_LOOKUP_SIZE bits of a Huffman code which has
+ * length longer than DECODE_LOOKUP_SIZE. In this case, bits 0 through 8
+ * represent an offset into long_code_lookup table and bits 9 through 12
+ * represent the maximum length of a Huffman code starting with the bits in the
+ * index i. The offset into long_code_lookup is for an array associated with all
+ * codes which start with the bits in i.
+ *
+ * The elements of long_code_lookup are in the same format as small_code_lookup,
+ * except bit 15 is never set. Let i be a number made up of DECODE_LOOKUP_SIZE
+ * bits. Then all Huffman codes which start with DECODE_LOOKUP_SIZE bits are
+ * stored in an array starting at index h in long_code_lookup. This index h is
+ * stored in bits 0 through 9 at index i in small_code_lookup. The index j is an
+ * index of this array if the number of bits contained in j and i is the number
+ * of bits in the longest huff_code starting with the bits of i. The symbol
+ * stored at index j is the symbol whose huffcode can be found in (j <<
+ * DECODE_LOOKUP_SIZE) | i. Note these arrays will be stored sorted in order of
+ * maximum Huffman code length.
+ *
+ * The following are explanations for sizes of the tables:
+ *
+ * Since small_code_lookup is a lookup on DECODE_LOOKUP_SIZE bits, it must have
+ * size 2^DECODE_LOOKUP_SIZE.
+ *
+ * To determine the amount of memory required for long_code_lookup, note that
+ * any element of long_code_lookup corresponds to a code, a duplicate of an
+ * existing code, or an invalid code. Since deflate Huffman codes are stored
+ * such that the code size and the code value form an increasing function, the
+ * number of duplicates is maximized when all the duplicates are in a single
+ * array, thus there are at most 2^(15 - DECODE_LOOKUP_SIZE) -
+ * (DECODE_LOOKUP_SIZE + 1) duplicate elements. Similarly the number of invalid
+ * elements is maximized at 2^(15 - DECODE_LOOKUP_SIZE) - 2^(floor((15 -
+ * DECODE_LOOKUP_SIZE)/2)) - 2^(ceil((15 - DECODE_LOOKUP_SIZE)/2)) + 1. Thus the
+ * amount of memory required is: NUM_CODES + 2^(16 - DECODE_LOOKUP_SIZE) -
+ * (DECODE_LOOKUP_SIZE + 1) - 2^(floor((15 - DECODE_LOOKUP_SIZE)/2)) -
+ * 2^(ceil((15 - DECODE_LOOKUP_SIZE)/2)) + 1. The values used below are those
+ * values rounded up to the nearest 16 byte boundary.
+ *
+ * Note that DECODE_LOOKUP_SIZE can be any length even though the offset in
+ * small_lookup_code is 9 bits long because the increasing relationship between
+ * code length and code value forces the maximum offset to be less than 288.
+ */
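+
+/*
+ * A paraphrase of the short-code lookup described above, purely illustrative
+ * and not the library's actual decode loop. huff_code and next_bits are
+ * placeholders; next_bits is assumed to hold at least DECODE_LOOKUP_SIZE low
+ * bits of the input stream.
+ *
+ * @code
+ *    uint32_t e = huff_code->short_code_lookup[next_bits & ((1 << DECODE_LOOKUP_SIZE) - 1)];
+ *    if (!(e & 0x8000)) {
+ *            uint32_t symbol   = e & 0x1ff;         // bits 0 through 8: decoded symbol
+ *            uint32_t code_len = (e >> 9) & 0xf;    // bits 9 through 12: bits consumed
+ *    } else {
+ *            // bits 0 through 8 give an offset into long_code_lookup and bits 9
+ *            // through 12 the maximum code length; a second lookup on the extra
+ *            // input bits resolves the symbol.
+ *    }
+ * @endcode
+ */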
+
+/* In the following defines, L stands for LARGE and S for SMALL */
+#define ISAL_L_REM (21 - ISAL_DECODE_LONG_BITS)
+#define ISAL_S_REM (15 - ISAL_DECODE_SHORT_BITS)
+
+#define ISAL_L_DUP ((1 << ISAL_L_REM) - (ISAL_L_REM + 1))
+#define ISAL_S_DUP ((1 << ISAL_S_REM) - (ISAL_S_REM + 1))
+
+#define ISAL_L_UNUSED ((1 << ISAL_L_REM) - (1 << ((ISAL_L_REM)/2)) - (1 << ((ISAL_L_REM + 1)/2)) + 1)
+#define ISAL_S_UNUSED ((1 << ISAL_S_REM) - (1 << ((ISAL_S_REM)/2)) - (1 << ((ISAL_S_REM + 1)/2)) + 1)
+
+#define ISAL_L_SIZE (ISAL_DEF_LIT_LEN_SYMBOLS + ISAL_L_DUP + ISAL_L_UNUSED)
+#define ISAL_S_SIZE (ISAL_DEF_DIST_SYMBOLS + ISAL_S_DUP + ISAL_S_UNUSED)
+
+#define ISAL_HUFF_CODE_LARGE_LONG_ALIGNED (ISAL_L_SIZE + (-ISAL_L_SIZE & 0xf))
+#define ISAL_HUFF_CODE_SMALL_LONG_ALIGNED (ISAL_S_SIZE + (-ISAL_S_SIZE & 0xf))
+
+/* Large lookup table for decoding huffman codes */
+struct inflate_huff_code_large {
+ uint32_t short_code_lookup[1 << (ISAL_DECODE_LONG_BITS)];
+ uint16_t long_code_lookup[ISAL_HUFF_CODE_LARGE_LONG_ALIGNED];
+};
+
+/* Small lookup table for decoding huffman codes */
+struct inflate_huff_code_small {
+ uint16_t short_code_lookup[1 << (ISAL_DECODE_SHORT_BITS)];
+ uint16_t long_code_lookup[ISAL_HUFF_CODE_SMALL_LONG_ALIGNED];
+};
+
+/** @brief Holds decompression state information*/
+struct inflate_state {
+ uint8_t *next_out; //!< Next output Byte
+ uint32_t avail_out; //!< Number of bytes available at next_out
+ uint32_t total_out; //!< Total bytes written out so far
+ uint8_t *next_in; //!< Next input byte
+ uint64_t read_in; //!< Bits buffered to handle unaligned streams
+ uint32_t avail_in; //!< Number of bytes available at next_in
+ int32_t read_in_length; //!< Bits in read_in
+ struct inflate_huff_code_large lit_huff_code; //!< Structure for decoding lit/len symbols
+ struct inflate_huff_code_small dist_huff_code; //!< Structure for decoding dist symbols
+ enum isal_block_state block_state; //!< Current decompression state
+ uint32_t dict_length; //!< Length of dictionary used
+ uint32_t bfinal; //!< Flag identifying final block
+	uint32_t crc_flag;			//!< Flag identifying whether to keep track of crc
+ uint32_t crc; //!< Contains crc or adler32 of output if crc_flag is set
+ uint32_t hist_bits; //!< Log base 2 of maximum lookback distance
+ union {
+ int32_t type0_block_len; //!< Length left to read of type 0 block when outbuffer overflow occurred
+ int32_t count; //!< Count of bytes remaining to be parsed
+ uint32_t dict_id;
+ };
+ int32_t write_overflow_lits;
+ int32_t write_overflow_len;
+ int32_t copy_overflow_length; //!< Length left to copy when outbuffer overflow occurred
+ int32_t copy_overflow_distance; //!< Lookback distance when outbuffer overflow occurred
+ int16_t wrapper_flag;
+ int16_t tmp_in_size; //!< Number of bytes in tmp_in_buffer
+ int32_t tmp_out_valid; //!< Number of bytes in tmp_out_buffer
+ int32_t tmp_out_processed; //!< Number of bytes processed in tmp_out_buffer
+ uint8_t tmp_in_buffer[ISAL_DEF_MAX_HDR_SIZE]; //!< Temporary buffer containing data from the input stream
+ uint8_t tmp_out_buffer[2 * ISAL_DEF_HIST_SIZE + ISAL_LOOK_AHEAD]; //!< Temporary buffer containing data from the output stream
+};
+
+/******************************************************************************/
+/* Compression functions */
+/******************************************************************************/
+/**
+ * @brief Updates histograms to include the symbols found in the input
+ * stream. Since this function only updates the histograms, it can be called on
+ * multiple streams to get a histogram better representing the desired data
+ * set. When first using histogram it must be initialized by zeroing the
+ * structure.
+ *
+ * @param in_stream: Input stream of data.
+ * @param length: The length of in_stream.
+ * @param histogram: The returned histogram of lit/len/dist symbols.
+ */
+void isal_update_histogram(uint8_t * in_stream, int length, struct isal_huff_histogram * histogram);
+
+
+/**
+ * @brief Creates a custom huffman code for the given histograms in which
+ * every literal and repeat length is assigned a code and all possible lookback
+ * distances are assigned a code.
+ *
+ * @param hufftables: the output structure containing the huffman code
+ * @param histogram: histogram containing frequency of literal symbols,
+ * repeat lengths and lookback distances
+ * @returns Returns a non zero value if an invalid huffman code was created.
+ */
+int isal_create_hufftables(struct isal_hufftables * hufftables,
+ struct isal_huff_histogram * histogram);
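+
+/*
+ * A minimal training sketch, assuming sample_buf/sample_len hold data
+ * representative of the streams to be compressed. The histogram must be
+ * zeroed before first use; the resulting hufftables can then be attached to a
+ * stream, for example through stream.hufftables or
+ * isal_deflate_set_hufftables().
+ *
+ * @code
+ *    struct isal_huff_histogram histogram;
+ *    struct isal_hufftables hufftables;
+ *    memset(&histogram, 0, sizeof(histogram));     // required initialization
+ *    isal_update_histogram(sample_buf, sample_len, &histogram);
+ *    if (isal_create_hufftables(&hufftables, &histogram) != 0)
+ *            ;   // an invalid huffman code was created
+ * @endcode
+ */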
+
+/**
+ * @brief Creates a custom huffman code for the given histograms like
+ * isal_create_hufftables() except literals with 0 frequency in the histogram
+ * are not assigned a code
+ *
+ * @param hufftables: the output structure containing the huffman code
+ * @param histogram: histogram containing frequency of literal symbols,
+ * repeat lengths and lookback distances
+ * @returns Returns a non zero value if an invalid huffman code was created.
+ */
+int isal_create_hufftables_subset(struct isal_hufftables * hufftables,
+ struct isal_huff_histogram * histogram);
+
+/**
+ * @brief Initialize compression stream data structure
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @returns none
+ */
+void isal_deflate_init(struct isal_zstream *stream);
+
+/**
+ * @brief Reinitialize compression stream data structure. Performs the same
+ * action as isal_deflate_init, but does not change user supplied input such as
+ * the level, flush type, compression wrapper (like gzip), hufftables, and
+ * end_of_stream_flag.
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @returns none
+ */
+void isal_deflate_reset(struct isal_zstream *stream);
+
+
+/**
+ * @brief Set gzip header default values
+ *
+ * @param gz_hdr: Gzip header to initialize.
+ */
+void isal_gzip_header_init(struct isal_gzip_header *gz_hdr);
+
+/**
+ * @brief Write gzip header to output stream
+ *
+ * Writes the gzip header to the output stream. On entry this function assumes
+ * that the output buffer has been initialized, so stream->next_out,
+ * stream->avail_out and stream->total_out have been set. If the output buffer
+ * contains insufficient space, stream is not modified.
+ *
+ * @param stream: Structure holding state information on the compression stream.
+ * @param gz_hdr: Structure holding the gzip header information to encode.
+ *
+ * @returns Returns 0 if the header is successfully written, otherwise returns
+ * the minimum size required to successfully write the gzip header to the output
+ * buffer.
+ */
+uint32_t isal_write_gzip_header(struct isal_zstream * stream, struct isal_gzip_header *gz_hdr);
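+
+/*
+ * A header-writing sketch, assuming stream has been initialized and its
+ * output buffer attached (next_out/avail_out set). With gzip_flag set to
+ * IGZIP_GZIP_NO_HDR, later isal_deflate() calls append only the gzip trailer
+ * around the header written here.
+ *
+ * @code
+ *    struct isal_gzip_header gz_hdr;
+ *    isal_gzip_header_init(&gz_hdr);
+ *    gz_hdr.name = (char *) "file.txt";            // optional header fields
+ *    stream.gzip_flag = IGZIP_GZIP_NO_HDR;
+ *    uint32_t needed = isal_write_gzip_header(&stream, &gz_hdr);
+ *    if (needed != 0)
+ *            ;   // output buffer too small; needed bytes are required
+ * @endcode
+ */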
+
+/**
+ * @brief Write zlib header to output stream
+ *
+ * Writes the zlib header to the output stream. On entry this function assumes
+ * that the output buffer has been initialized, so stream->next_out,
+ * stream->avail_out and stream->total_out have been set. If the output buffer
+ * contains insufficient space, stream is not modified.
+ *
+ * @param stream: Structure holding state information on the compression stream.
+ * @param z_hdr: Structure holding the zlib header information to encode.
+ *
+ * @returns Returns 0 if the header is successfully written, otherwise returns
+ * the minimum size required to successfully write the zlib header to the output
+ * buffer.
+ */
+uint32_t isal_write_zlib_header(struct isal_zstream * stream, struct isal_zlib_header *z_hdr);
+
+/**
+ * @brief Set stream to use a new Huffman code
+ *
+ * Sets the Huffman code to be used in compression before compression start or
+ * after the successful completion of a SYNC_FLUSH or FULL_FLUSH. If type has
+ * value IGZIP_HUFFTABLE_DEFAULT, the stream is set to use the default Huffman
+ * code. If type has value IGZIP_HUFFTABLE_STATIC, the stream is set to use the
+ * deflate standard static Huffman code, or if type has value
+ * IGZIP_HUFFTABLE_CUSTOM, the stream is set to use the isal_hufftables
+ * structure input to isal_deflate_set_hufftables.
+ *
+ * @param stream: Structure holding state information on the compression stream.
+ * @param hufftables: new huffman code to use if type is set to
+ * IGZIP_HUFFTABLE_CUSTOM.
+ * @param type: Flag specifying what hufftable to use.
+ *
+ * @returns Returns ISAL_INVALID_OPERATION if the stream was unmodified. This may be
+ * due to the stream being in a state where changing the huffman code is not
+ * allowed or an invalid input is provided.
+ */
+int isal_deflate_set_hufftables(struct isal_zstream *stream,
+ struct isal_hufftables *hufftables, int type);
+
+/**
+ * @brief Initialize compression stream data structure
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @returns none
+ */
+void isal_deflate_stateless_init(struct isal_zstream *stream);
+
+
+/**
+ * @brief Set compression dictionary to use
+ *
+ * This function is to be called after isal_deflate_init, or after completing a
+ * SYNC_FLUSH or FULL_FLUSH and before the next call to isal_deflate. If the
+ * dictionary is longer than IGZIP_HIST_SIZE, only the last IGZIP_HIST_SIZE
+ * bytes will be used.
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @param dict: Array containing dictionary to use.
+ * @param dict_len: Length of dict.
+ * @returns COMP_OK,
+ * ISAL_INVALID_STATE (dictionary could not be set)
+ */
+int isal_deflate_set_dict(struct isal_zstream *stream, uint8_t *dict, uint32_t dict_len);
+
+/** @brief Structure for holding processed dictionary information */
+
+struct isal_dict {
+ uint32_t params;
+ uint32_t level;
+ uint32_t hist_size;
+ uint32_t hash_size;
+ uint8_t history[ISAL_DEF_HIST_SIZE];
+ uint16_t hashtable[IGZIP_LVL3_HASH_SIZE];
+};
+
+/**
+ * @brief Process dictionary to reuse later
+ *
+ * Processes a dictionary so that the generated output can be reused to reset a
+ * new deflate stream more quickly than isal_deflate_set_dict() alone. This
+ * function is paired with isal_deflate_reset_dict() when using the same
+ * dictionary on multiple deflate objects. The stream.level must be set prior to
+ * calling this function to process the dictionary correctly. If the dictionary
+ * is longer than IGZIP_HIST_SIZE, only the last IGZIP_HIST_SIZE bytes will be
+ * used.
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @param dict_str: Structure to hold processed dictionary info to reuse later.
+ * @param dict: Array containing dictionary to use.
+ * @param dict_len: Length of dict.
+ * @returns COMP_OK,
+ * ISAL_INVALID_STATE (dictionary could not be processed)
+ */
+int isal_deflate_process_dict(struct isal_zstream *stream, struct isal_dict *dict_str,
+ uint8_t *dict, uint32_t dict_len);
+
+/**
+ * @brief Reset compression dictionary to use
+ *
+ * Similar to isal_deflate_set_dict() but on pre-processed dictionary
+ * data. Pairing with isal_deflate_process_dict() can reduce the processing time
+ * on subsequent compression with dictionary especially on small files.
+ *
+ * Like isal_deflate_set_dict(), this function is to be called after
+ * isal_deflate_init, or after completing a SYNC_FLUSH or FULL_FLUSH and before
+ * the next call do isal_deflate. Changing compression level between dictionary
+ * process and reset will cause return of ISAL_INVALID_STATE.
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @param dict_str: Structure with pre-processed dictionary info.
+ * @returns COMP_OK,
+ * ISAL_INVALID_STATE or other (dictionary could not be reset)
+ */
+int isal_deflate_reset_dict(struct isal_zstream *stream, struct isal_dict *dict_str);
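+
+/*
+ * A dictionary-reuse sketch: the dictionary (dict/dict_len, placeholders) is
+ * processed once after the level has been chosen, then each stream that uses
+ * it is reset from the processed copy instead of re-running
+ * isal_deflate_set_dict(). Level buffers and I/O setup are omitted.
+ *
+ * @code
+ *    struct isal_dict dict_str;
+ *    isal_deflate_init(&stream);
+ *    stream.level = 1;                       // must match the later streams
+ *    isal_deflate_process_dict(&stream, &dict_str, dict, dict_len);
+ *
+ *    // for each buffer compressed with this dictionary:
+ *    isal_deflate_reset(&stream);
+ *    isal_deflate_reset_dict(&stream, &dict_str);
+ *    // ... set next_in/next_out and call isal_deflate() ...
+ * @endcode
+ */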
+
+
+/**
+ * @brief Fast data (deflate) compression for storage applications.
+ *
+ * The call to isal_deflate() will take data from the input buffer (updating
+ * next_in, avail_in) and write a compressed stream to the output buffer
+ * (updating next_out and avail_out). The function returns when either the input
+ * buffer is empty or the output buffer is full.
+ *
+ * On entry to isal_deflate(), next_in points to an input buffer and avail_in
+ * indicates the length of that buffer. Similarly next_out points to an empty
+ * output buffer and avail_out indicates the size of that buffer.
+ *
+ * The fields total_in and total_out start at 0 and are updated by
+ * isal_deflate(). These reflect the total number of bytes read or written so far.
+ *
+ * When the last input buffer is passed in, signaled by setting the
+ * end_of_stream, the routine will complete compression at the end of the input
+ * buffer, as long as the output buffer is big enough.
+ *
+ * The compression level can be set by setting level to any value between
+ * ISAL_DEF_MIN_LEVEL and ISAL_DEF_MAX_LEVEL. When the compression level is
+ * ISAL_DEF_MIN_LEVEL, hufftables can be set to a table trained for the
+ * specific data type being compressed to achieve better compression. When a
+ * higher compression level is desired, a larger generic memory buffer needs to
+ * be supplied by setting level_buf and level_buf_size to represent the chunk of
+ * memory. For level x, the suggested size for this buffer is
+ * ISAL_DEF_LVLx_DEFAULT. The defines ISAL_DEF_LVLx_MIN, ISAL_DEF_LVLx_SMALL,
+ * ISAL_DEF_LVLx_MEDIUM, ISAL_DEF_LVLx_LARGE, and ISAL_DEF_LVLx_EXTRA_LARGE
+ * are also provided as other suggested sizes.
+ *
+ * The equivalent of the zlib FLUSH_SYNC operation is currently supported.
+ * Flush types can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH. Default flush type is
+ * NO_FLUSH. A SYNC_FLUSH or FULL_FLUSH will byte align the deflate block by
+ * appending an empty stored block once all input has been compressed, including
+ * the buffered input. Checking that the out_buffer is not empty or that
+ * internal_state.state = ZSTATE_NEW_HDR is sufficient to guarantee all input
+ * has been flushed. Additionally FULL_FLUSH will ensure look back history does
+ * not include previous blocks so new blocks are fully independent. Switching
+ * between flush types is supported.
+ *
+ * If a compression dictionary is required, the dictionary can be set by calling
+ * isal_deflate_set_dict() before calling isal_deflate.
+ *
+ * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip
+ * trailer are written around the deflate compressed data. If gzip_flag is set
+ * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written. A full-featured
+ * header is supported by the isal_write_{gzip,zlib}_header() functions.
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @return COMP_OK (if everything is ok),
+ * INVALID_FLUSH (if an invalid FLUSH is selected),
+ * ISAL_INVALID_LEVEL (if an invalid compression level is selected),
+ * ISAL_INVALID_LEVEL_BUF (if the level buffer is not large enough).
+ */
+int isal_deflate(struct isal_zstream *stream);
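+
+/*
+ * A single-buffer streaming sketch, assuming in_buf/in_len hold the whole
+ * input and out_buf/out_size a (possibly smaller) output buffer that the
+ * caller drains each time it fills. Level selection and level_buf setup are
+ * omitted.
+ *
+ * @code
+ *    struct isal_zstream stream;
+ *    isal_deflate_init(&stream);
+ *    stream.end_of_stream = 1;               // this input buffer is the last one
+ *    stream.flush = NO_FLUSH;
+ *    stream.next_in = in_buf;
+ *    stream.avail_in = in_len;
+ *    do {
+ *            stream.next_out = out_buf;
+ *            stream.avail_out = out_size;
+ *            if (isal_deflate(&stream) != COMP_OK)
+ *                    break;                  // error
+ *            // ... consume out_size - stream.avail_out bytes from out_buf ...
+ *    } while (stream.internal_state.state != ZSTATE_END);
+ * @endcode
+ */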
+
+
+/**
+ * @brief Fast data (deflate) stateless compression for storage applications.
+ *
+ * Stateless (one shot) compression routine with a similar interface to
+ * isal_deflate() but operates on entire input buffer at one time. Parameter
+ * avail_out must be large enough to fit the entire compressed output. Max
+ * expansion is limited to the input size plus the header size of a stored/raw
+ * block.
+ *
+ * When the compression level is set to 1, unlike in isal_deflate(), level_buf
+ * may be optionally set depending on what performance is desired.
+ *
+ * For stateless the flush types NO_FLUSH and FULL_FLUSH are supported.
+ * FULL_FLUSH will byte align the output deflate block so additional blocks can
+ * be easily appended.
+ *
+ * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip
+ * trailer are written around the deflate compressed data. If gzip_flag is set
+ * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written.
+ *
+ * @param stream Structure holding state information on the compression streams.
+ * @return COMP_OK (if everything is ok),
+ * INVALID_FLUSH (if an invalid FLUSH is selected),
+ * ISAL_INVALID_LEVEL (if an invalid compression level is selected),
+ * ISAL_INVALID_LEVEL_BUF (if the level buffer is not large enough),
+ * STATELESS_OVERFLOW (if output buffer will not fit output).
+ */
+int isal_deflate_stateless(struct isal_zstream *stream);
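+
+/*
+ * A one-shot sketch, assuming out_buf/out_size are large enough for the worst
+ * case (input size plus stored-block header overhead, as noted above).
+ * in_buf/in_len and the buffers are placeholders.
+ *
+ * @code
+ *    struct isal_zstream stream;
+ *    isal_deflate_stateless_init(&stream);
+ *    stream.next_in = in_buf;
+ *    stream.avail_in = in_len;
+ *    stream.next_out = out_buf;
+ *    stream.avail_out = out_size;
+ *    stream.gzip_flag = IGZIP_GZIP;          // wrap output with gzip header and trailer
+ *    int ret = isal_deflate_stateless(&stream);
+ *    // on COMP_OK the compressed size is stream.total_out
+ * @endcode
+ */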
+
+
+/******************************************************************************/
+/* Inflate functions */
+/******************************************************************************/
+/**
+ * @brief Initialize decompression state data structure
+ *
+ * @param state Structure holding state information on the compression streams.
+ * @returns none
+ */
+void isal_inflate_init(struct inflate_state *state);
+
+/**
+ * @brief Reinitialize decompression state data structure
+ *
+ * @param state Structure holding state information on the compression streams.
+ * @returns none
+ */
+void isal_inflate_reset(struct inflate_state *state);
+
+/**
+ * @brief Set decompression dictionary to use
+ *
+ * This function is to be called after isal_inflate_init. If the dictionary is
+ * longer than IGZIP_HIST_SIZE, only the last IGZIP_HIST_SIZE bytes will be
+ * used.
+ *
+ * @param state: Structure holding state information on the decompression stream.
+ * @param dict: Array containing dictionary to use.
+ * @param dict_len: Length of dict.
+ * @returns COMP_OK,
+ * ISAL_INVALID_STATE (dictionary could not be set)
+ */
+int isal_inflate_set_dict(struct inflate_state *state, uint8_t *dict, uint32_t dict_len);
+
+/**
+ * @brief Read and return gzip header information
+ *
+ * On entry state must be initialized and next_in pointing to a gzip compressed
+ * buffer. The buffers gz_hdr->extra, gz_hdr->name, gz_hdr->comment and the
+ * buffer lengths must be set to record the corresponding field, or set to NULL
+ * to disregard that gzip header information. If one of these buffers overflows,
+ * the user can reallocate a larger buffer and call this function again to
+ * continue reading the header information.
+ *
+ * @param state: Structure holding state information on the decompression stream.
+ * @param gz_hdr: Structure to return data encoded in the gzip header
+ * @returns ISAL_DECOMP_OK (header was successfully parsed)
+ * ISAL_END_INPUT (all input was parsed),
+ * ISAL_NAME_OVERFLOW (gz_hdr->name overflowed while parsing),
+ * ISAL_COMMENT_OVERFLOW (gz_hdr->comment overflowed while parsing),
+ * ISAL_EXTRA_OVERFLOW (gz_hdr->extra overflowed while parsing),
+ * ISAL_INVALID_WRAPPER (invalid gzip header found),
+ * ISAL_UNSUPPORTED_METHOD (deflate is not the compression method),
+ * ISAL_INCORRECT_CHECKSUM (gzip header checksum was incorrect)
+ */
+int isal_read_gzip_header (struct inflate_state *state, struct isal_gzip_header *gz_hdr);
+
+/**
+ * @brief Read and return zlib header information
+ *
+ * On entry state must be initialized and next_in pointing to a zlib compressed
+ * buffer.
+ *
+ * @param state: Structure holding state information on the decompression stream.
+ * @param zlib_hdr: Structure to return data encoded in the zlib header
+ * @returns ISAL_DECOMP_OK (header was successfully parsed),
+ * ISAL_END_INPUT (all input was parsed),
+ * ISAL_UNSUPPORTED_METHOD (deflate is not the compression method),
+ * ISAL_INCORRECT_CHECKSUM (zlib header checksum was incorrect)
+ */
+int isal_read_zlib_header (struct inflate_state *state, struct isal_zlib_header *zlib_hdr);
+
+/**
+ * @brief Fast data (deflate) decompression for storage applications.
+ *
+ * On entry to isal_inflate(), next_in points to an input buffer and avail_in
+ * indicates the length of that buffer. Similarly next_out points to an empty
+ * output buffer and avail_out indicates the size of that buffer.
+ *
+ * The field total_out starts at 0 and is updated by isal_inflate(). This
+ * reflects the total number of bytes written so far.
+ *
+ * The call to isal_inflate() will take data from the input buffer (updating
+ * next_in, avail_in) and write a decompressed stream to the output buffer
+ * (updating next_out and avail_out). The function returns when the input buffer
+ * is empty, the output buffer is full, invalid data is found, or in the case of
+ * zlib formatted data if a dictionary is specified. The current state of the
+ * decompression on exit can be read from state->block-state.
+ *
+ * If the crc_flag is set to ISAL_GZIP_NO_HDR the gzip crc of the output is
+ * stored in state->crc. Alternatively, if the crc_flag is set to
+ * ISAL_ZLIB_NO_HDR the adler32 of the output is stored in state->crc (checksum
+ * may not be updated until decompression is complete). When the crc_flag is set
+ * to ISAL_GZIP_NO_HDR_VER or ISAL_ZLIB_NO_HDR_VER, the behavior is the same,
+ * except the checksum is verified against the checksum immediately following
+ * the deflate data. If the crc_flag is set to ISAL_GZIP or ISAL_ZLIB, the
+ * gzip/zlib header is parsed, state->crc is set to the appropriate checksum,
+ * and the checksum is verified. If the crc_flag is set to ISAL_DEFLATE
+ * (default), then the data is treated as a raw deflate block.
+ *
+ * The element state->hist_bits has values from 0 to 15, where values of 1 to 15
+ * are the log base 2 size of the matching window and 0 is the default with
+ * maximum history size.
+ *
+ * If a dictionary is required, a call to isal_inflate_set_dict will set the
+ * dictionary.
+ *
+ * @param state Structure holding state information on the decompression stream.
+ * @return ISAL_DECOMP_OK (if everything is ok),
+ * ISAL_INVALID_BLOCK,
+ * ISAL_NEED_DICT,
+ * ISAL_INVALID_SYMBOL,
+ * ISAL_INVALID_LOOKBACK,
+ * ISAL_INVALID_WRAPPER,
+ * ISAL_UNSUPPORTED_METHOD,
+ * ISAL_INCORRECT_CHECKSUM.
+ */
+
+int isal_inflate(struct inflate_state *state);
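A minimal streaming loop might look like the sketch below (editorial illustration, not part of the header). read_more() and write_out() are placeholders for application I/O, BUF_SIZE is an arbitrary buffer size, and error handling for truncated input is elided.

    struct inflate_state state;
    uint8_t inbuf[BUF_SIZE], outbuf[BUF_SIZE];
    int ret = ISAL_DECOMP_OK;

    isal_inflate_init(&state);
    state.crc_flag = ISAL_GZIP;          /* parse gzip wrapper and verify crc */

    do {
            if (state.avail_in == 0) {
                    state.next_in = inbuf;
                    state.avail_in = read_more(inbuf, sizeof(inbuf));
            }
            state.next_out = outbuf;
            state.avail_out = sizeof(outbuf);

            ret = isal_inflate(&state);

            write_out(outbuf, sizeof(outbuf) - state.avail_out);
    } while (ret == ISAL_DECOMP_OK && state.block_state != ISAL_BLOCK_FINISH);
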
+
+/**
+ * @brief Fast data (deflate) stateless decompression for storage applications.
+ *
+ * Stateless (one shot) decompression routine with a similar interface to
+ * isal_inflate() but operates on entire input buffer at one time. Parameter
+ * avail_out must be large enough to fit the entire decompressed
+ * output. Dictionaries are not supported.
+ *
+ * @param state Structure holding state information on the decompression stream.
+ * @return ISAL_DECOMP_OK (if everything is ok),
+ * ISAL_END_INPUT (if all input was decompressed),
+ * ISAL_NEED_DICT,
+ * ISAL_OUT_OVERFLOW (if output buffer ran out of space),
+ * ISAL_INVALID_BLOCK,
+ * ISAL_INVALID_SYMBOL,
+ * ISAL_INVALID_LOOKBACK,
+ * ISAL_INVALID_WRAPPER,
+ * ISAL_UNSUPPORTED_METHOD,
+ * ISAL_INCORRECT_CHECKSUM.
+ */
+int isal_inflate_stateless(struct inflate_state *state);
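A one-shot call is simpler; the sketch below (not part of the header) assumes the whole compressed stream and a sufficiently large output buffer are already in memory. comp_buf, comp_len, out_buf and out_len are placeholders.

    struct inflate_state state;

    isal_inflate_init(&state);
    state.next_in = comp_buf;            /* entire deflate stream */
    state.avail_in = comp_len;
    state.next_out = out_buf;            /* must fit the whole output */
    state.avail_out = out_len;

    if (isal_inflate_stateless(&state) == ISAL_DECOMP_OK) {
            /* state.total_out holds the number of decompressed bytes */
    }
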
+
+/******************************************************************************/
+/* Other functions */
+/******************************************************************************/
+/**
+ * @brief Calculate Adler-32 checksum, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param init: initial Adler-32 value
+ * @param buf: buffer to calculate checksum on
+ * @param len: buffer length in bytes
+ *
+ * @returns 32-bit Adler-32 checksum
+ */
+uint32_t isal_adler32(uint32_t init, const unsigned char *buf, uint64_t len);
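For example (editorial sketch), checksumming a buffer in two chunks works by feeding the previous result back in as the seed; 1 is the conventional Adler-32 starting value. buf, first_len and len are placeholders.

    uint32_t adler = 1;                              /* conventional seed */
    adler = isal_adler32(adler, buf, first_len);
    adler = isal_adler32(adler, buf + first_len, len - first_len);
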
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* ifndef _IGZIP_H */
diff --git a/src/isa-l/include/mem_routines.h b/src/isa-l/include/mem_routines.h
new file mode 100644
index 000000000..3d23522e9
--- /dev/null
+++ b/src/isa-l/include/mem_routines.h
@@ -0,0 +1,64 @@
+/**********************************************************************
+ Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stddef.h>
+
+/**
+ * @file mem_routines.h
+ * @brief Interface to storage mem operations
+ *
+ * Defines the interface for vector versions of common memory functions.
+ */
+
+
+#ifndef _MEM_ROUTINES_H_
+#define _MEM_ROUTINES_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Detect if a memory region is all zero
+ *
+ * Zero detect function with optimizations for large blocks > 128 bytes
+ *
+ * @param mem Pointer to memory region to test
+ * @param len Length of region in bytes
+ * @returns 0 - region is all zeros
+ * other - region has non zero bytes
+ */
+int isal_zero_detect(void *mem, size_t len);
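A typical use (illustrative only) is skipping writes of all-zero blocks; block, block_len and write_block() are placeholders.

    if (isal_zero_detect(block, block_len) == 0) {
            /* block is entirely zero - record a hole instead of writing it */
    } else {
            write_block(block, block_len);           /* hypothetical writer */
    }
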
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MEM_ROUTINES_H_
+
diff --git a/src/isa-l/include/multibinary.asm b/src/isa-l/include/multibinary.asm
new file mode 100644
index 000000000..588352a2f
--- /dev/null
+++ b/src/isa-l/include/multibinary.asm
@@ -0,0 +1,440 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _MULTIBINARY_ASM_
+%define _MULTIBINARY_ASM_
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ %define mbin_def_ptr dd
+ %define mbin_ptr_sz dword
+ %define mbin_rdi edi
+ %define mbin_rsi esi
+ %define mbin_rax eax
+ %define mbin_rbx ebx
+ %define mbin_rcx ecx
+ %define mbin_rdx edx
+%else
+ %define mbin_def_ptr dq
+ %define mbin_ptr_sz qword
+ %define mbin_rdi rdi
+ %define mbin_rsi rsi
+ %define mbin_rax rax
+ %define mbin_rbx rbx
+ %define mbin_rcx rcx
+ %define mbin_rdx rdx
+%endif
+
+%ifndef AS_FEATURE_LEVEL
+%define AS_FEATURE_LEVEL 4
+%endif
+
+;;;;
+; multibinary macro:
+; creates the visible entry point that uses the HW optimized call pointer
+; creates the init of the HW optimized call pointer
+;;;;
+%macro mbin_interface 1
+ ;;;;
+ ; *_dispatched is defaulted to *_mbinit and replaced on first call.
+ ; Therefore, *_dispatch_init is only executed on first call.
+ ;;;;
+ section .data
+ %1_dispatched:
+ mbin_def_ptr %1_mbinit
+
+ section .text
+ mk_global %1, function
+ %1_mbinit:
+ endbranch
+ ;;; only called the first time to setup hardware match
+ call %1_dispatch_init
+ ;;; falls thru to execute the hw optimized code
+ %1:
+ endbranch
+ jmp mbin_ptr_sz [%1_dispatched]
+%endmacro
+
+;;;;;
+; mbin_dispatch_init parameters
+; Use this macro when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+;;;;;
+%macro mbin_dispatch_init 4
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init2 parameters
+; Cases where only base functions are available
+; 1-> function name
+; 2-> base function
+;;;;;
+%macro mbin_dispatch_init2 2
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init_clmul 5 parameters
+; Use this case for CRC which needs both SSE4_1 and CLMUL
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 and CLMUL optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX512/10 opt func
+;;;;;
+%macro mbin_dispatch_init_clmul 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ jz _%1_init_done
+ test ecx, FLAG_CPUID1_ECX_CLMUL
+ jz _%1_init_done
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+		mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+%if AS_FEATURE_LEVEL >= 10
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ jne _%1_init_done
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ lea mbin_rbx, [%5 WRT_OPT] ; AVX512/10 opt
+ cmove mbin_rsi, mbin_rbx
+%endif
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init5 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init5 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ ; Test for SSE4.2
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%3 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+%if AS_FEATURE_LEVEL >= 6
+;;;;;
+; mbin_dispatch_init6 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+;;;;;
+%macro mbin_dispatch_init6 6
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ je _%1_init_done ; Use base function if no SSE4_2
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+		mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+%else
+%macro mbin_dispatch_init6 6
+ mbin_dispatch_init5 %1, %2, %3, %4, %5
+%endmacro
+%endif
+
+%if AS_FEATURE_LEVEL >= 10
+;;;;;
+; mbin_dispatch_init7 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVX512 Update/10 opt func
+;;;;;
+%macro mbin_dispatch_init7 7
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ je _%1_init_done ; Use base function if no SSE4_2
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+		mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+		lea	mbin_rbx, [%7 WRT_OPT] ; AVX512 update/10 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+%else
+%macro mbin_dispatch_init7 7
+ mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
+%endmacro
+%endif
+
+%endif ; ifndef _MULTIBINARY_ASM_
diff --git a/src/isa-l/include/raid.h b/src/isa-l/include/raid.h
new file mode 100644
index 000000000..6100a4824
--- /dev/null
+++ b/src/isa-l/include/raid.h
@@ -0,0 +1,305 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _RAID_H_
+#define _RAID_H_
+
+/**
+ * @file raid.h
+ * @brief Interface to RAID functions - XOR and P+Q calculation.
+ *
+ * This file defines the interface to optimized XOR calculation (RAID5) or P+Q
+ * dual parity (RAID6). Operations are carried out on an array of pointers to
+ * sources and output arrays.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Multi-binary functions */
+
+/**
+ * @brief Generate XOR parity vector from N sources, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes.
+ * @param array Array of pointers to source and dest. For XOR the dest is
+ * the last pointer. ie array[vects-1]. Src and dest
+ * pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen(int vects, int len, void **array);
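As a rough sketch (not part of the header), generating RAID5-style parity over four 32B-aligned source buffers into a fifth buffer might look like the following; it assumes <stdlib.h> (or the posix_memalign shim in types.h) and elides allocation error checking.

    enum { SRCS = 4, LEN = 16 * 1024 };
    void *vec[SRCS + 1];                     /* sources plus parity dest */
    int i;

    for (i = 0; i <= SRCS; i++)
            posix_memalign(&vec[i], 32, LEN);  /* 32B alignment required */
    /* ... fill vec[0..SRCS-1] with data ... */

    if (xor_gen(SRCS + 1, LEN, vec) != 0) {
            /* handle failure */
    }
    /* vec[SRCS] now holds the XOR parity; xor_check(SRCS + 1, LEN, vec)
     * should return 0 afterwards. */
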
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects Number of vectors in array.
+ * @param len Length of each vector in bytes.
+ * @param array Array of pointers to vectors. Src and dest pointers
+ * must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes. Must be 32B aligned.
+ * @param array Array of pointers to source and dest. For P+Q the dest
+ * is the last two pointers. ie array[vects-2],
+ * array[vects-1]. P and Q parity vectors are
+ * written to these last two pointers. Src and dest
+ * pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen(int vects, int len, void **array);
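P+Q generation is laid out the same way, except the last two pointers receive P and Q. A rough sketch under the same assumptions as above (k data buffers, length a multiple of 32, allocation checks elided):

    int k = 10;                              /* data vectors */
    int vects = k + 2;                       /* plus P and Q */
    int len = 4096;                          /* multiple of 32 */
    void *array[12];

    for (int i = 0; i < vects; i++)
            posix_memalign(&array[i], 32, len);
    /* ... fill array[0..k-1] with data ... */

    if (pq_gen(vects, len, array) == 0) {
            /* array[k] is the P parity, array[k+1] is the Q parity */
    }
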
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects Number of vectors in array including P&Q.
+ * @param len Length of each vector in bytes. Must be 16B aligned.
+ * @param array Array of pointers to source and P, Q. P and Q parity
+ * are assumed to be the last two pointers in the array.
+ * All pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_check(int vects, int len, void **array);
+
+
+/* Arch specific versions */
+// x86 only
+#if defined(__i386__) || defined(__x86_64__)
+
+/**
+ * @brief Generate XOR parity vector from N sources.
+ * @requires SSE4.1
+ *
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes.
+ * @param array Array of pointers to source and dest. For XOR the dest is
+ * the last pointer. ie array[vects-1]. Src and dest pointers
+ * must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate XOR parity vector from N sources.
+ * @requires AVX
+ *
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes.
+ * @param array Array of pointers to source and dest. For XOR the dest is
+ * the last pointer. ie array[vects-1]. Src and dest pointers
+ * must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_avx(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors.
+ * @requires SSE4.1
+ *
+ * @param vects Number of vectors in array.
+ * @param len Length of each vector in bytes.
+ * @param array Array of pointers to vectors. Src and dest pointers
+ * must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires SSE4.1
+ *
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes. Must be 16B aligned.
+ * @param array Array of pointers to source and dest. For P+Q the dest
+ * is the last two pointers. ie array[vects-2],
+ * array[vects-1]. P and Q parity vectors are
+ * written to these last two pointers. Src and dest
+ * pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires AVX
+ *
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes. Must be 16B aligned.
+ * @param array Array of pointers to source and dest. For P+Q the dest
+ * is the last two pointers. ie array[vects-2],
+ * array[vects-1]. P and Q parity vectors are
+ * written to these last two pointers. Src and dest
+ * pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_avx(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires AVX2
+ *
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes. Must be 32B aligned.
+ * @param array Array of pointers to source and dest. For P+Q the dest
+ * is the last two pointers. ie array[vects-2],
+ * array[vects-1]. P and Q parity vectors are
+ * written to these last two pointers. Src and dest
+ * pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_avx2(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors.
+ * @requires SSE4.1
+ *
+ * @param vects Number of vectors in array including P&Q.
+ * @param len Length of each vector in bytes. Must be 16B aligned.
+ * @param array Array of pointers to source and P, Q. P and Q parity
+ * are assumed to be the last two pointers in the array.
+ * All pointers must be aligned to 16B.
+ * @returns 0 pass, other fail
+ */
+
+int pq_check_sse(int vects, int len, void **array);
+
+#endif
+
+/**
+ * @brief Generate P+Q parity vectors from N sources, runs baseline version.
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes. Must be 16B aligned.
+ * @param array Array of pointers to source and dest. For P+Q the dest
+ * is the last two pointers. ie array[vects-2],
+ * array[vects-1]. P and Q parity vectors are
+ * written to these last two pointers. Src and dest pointers
+ * must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate XOR parity vector from N sources, runs baseline version.
+ * @param vects Number of source+dest vectors in array.
+ * @param len Length of each vector in bytes.
+ * @param array Array of pointers to source and dest. For XOR the dest is
+ * the last pointer. ie array[vects-1]. Src and dest pointers
+ * must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors, runs baseline version.
+ *
+ * @param vects Number of vectors in array.
+ * @param len Length of each vector in bytes.
+ * @param array Array of pointers to vectors. Src and dest pointers
+ * must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs baseline version.
+ *
+ * @param vects Number of vectors in array including P&Q.
+ * @param len Length of each vector in bytes. Must be 16B aligned.
+ * @param array Array of pointers to source and P, Q. P and Q parity
+ * are assumed to be the last two pointers in the array.
+ * All pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_check_base(int vects, int len, void **array);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_RAID_H_
diff --git a/src/isa-l/include/reg_sizes.asm b/src/isa-l/include/reg_sizes.asm
new file mode 100644
index 000000000..b7ad842d8
--- /dev/null
+++ b/src/isa-l/include/reg_sizes.asm
@@ -0,0 +1,291 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _REG_SIZES_ASM_
+%define _REG_SIZES_ASM_
+
+%ifndef AS_FEATURE_LEVEL
+%define AS_FEATURE_LEVEL 4
+%endif
+
+%define EFLAGS_HAS_CPUID (1<<21)
+%define FLAG_CPUID1_ECX_CLMUL (1<<1)
+%define FLAG_CPUID1_EDX_SSE2 (1<<26)
+%define FLAG_CPUID1_ECX_SSE3 (1)
+%define FLAG_CPUID1_ECX_SSE4_1 (1<<19)
+%define FLAG_CPUID1_ECX_SSE4_2 (1<<20)
+%define FLAG_CPUID1_ECX_POPCNT (1<<23)
+%define FLAG_CPUID1_ECX_AESNI (1<<25)
+%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
+%define FLAG_CPUID1_ECX_AVX (1<<28)
+%define FLAG_CPUID1_EBX_AVX2 (1<<5)
+
+%define FLAG_CPUID7_EBX_AVX2 (1<<5)
+%define FLAG_CPUID7_EBX_AVX512F (1<<16)
+%define FLAG_CPUID7_EBX_AVX512DQ (1<<17)
+%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21)
+%define FLAG_CPUID7_EBX_AVX512PF (1<<26)
+%define FLAG_CPUID7_EBX_AVX512ER (1<<27)
+%define FLAG_CPUID7_EBX_AVX512CD (1<<28)
+%define FLAG_CPUID7_EBX_AVX512BW (1<<30)
+%define FLAG_CPUID7_EBX_AVX512VL (1<<31)
+
+%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1)
+%define FLAG_CPUID7_ECX_AVX512VBMI2 (1 << 6)
+%define FLAG_CPUID7_ECX_GFNI (1 << 8)
+%define FLAG_CPUID7_ECX_VAES (1 << 9)
+%define FLAG_CPUID7_ECX_VPCLMULQDQ (1 << 10)
+%define FLAG_CPUID7_ECX_VNNI (1 << 11)
+%define FLAG_CPUID7_ECX_BITALG (1 << 12)
+%define FLAG_CPUID7_ECX_VPOPCNTDQ (1 << 14)
+
+%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ)
+%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ)
+
+%define FLAG_XGETBV_EAX_XMM (1<<1)
+%define FLAG_XGETBV_EAX_YMM (1<<2)
+%define FLAG_XGETBV_EAX_XMM_YMM 0x6
+%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0
+
+%define FLAG_CPUID1_EAX_AVOTON 0x000406d0
+%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0
+
+; define d and w variants for registers
+
+%define raxd eax
+%define raxw ax
+%define raxb al
+
+%define rbxd ebx
+%define rbxw bx
+%define rbxb bl
+
+%define rcxd ecx
+%define rcxw cx
+%define rcxb cl
+
+%define rdxd edx
+%define rdxw dx
+%define rdxb dl
+
+%define rsid esi
+%define rsiw si
+%define rsib sil
+
+%define rdid edi
+%define rdiw di
+%define rdib dil
+
+%define rbpd ebp
+%define rbpw bp
+%define rbpb bpl
+
+%define ymm0x xmm0
+%define ymm1x xmm1
+%define ymm2x xmm2
+%define ymm3x xmm3
+%define ymm4x xmm4
+%define ymm5x xmm5
+%define ymm6x xmm6
+%define ymm7x xmm7
+%define ymm8x xmm8
+%define ymm9x xmm9
+%define ymm10x xmm10
+%define ymm11x xmm11
+%define ymm12x xmm12
+%define ymm13x xmm13
+%define ymm14x xmm14
+%define ymm15x xmm15
+
+%define zmm0x xmm0
+%define zmm1x xmm1
+%define zmm2x xmm2
+%define zmm3x xmm3
+%define zmm4x xmm4
+%define zmm5x xmm5
+%define zmm6x xmm6
+%define zmm7x xmm7
+%define zmm8x xmm8
+%define zmm9x xmm9
+%define zmm10x xmm10
+%define zmm11x xmm11
+%define zmm12x xmm12
+%define zmm13x xmm13
+%define zmm14x xmm14
+%define zmm15x xmm15
+%define zmm16x xmm16
+%define zmm17x xmm17
+%define zmm18x xmm18
+%define zmm19x xmm19
+%define zmm20x xmm20
+%define zmm21x xmm21
+%define zmm22x xmm22
+%define zmm23x xmm23
+%define zmm24x xmm24
+%define zmm25x xmm25
+%define zmm26x xmm26
+%define zmm27x xmm27
+%define zmm28x xmm28
+%define zmm29x xmm29
+%define zmm30x xmm30
+%define zmm31x xmm31
+
+%define zmm0y ymm0
+%define zmm1y ymm1
+%define zmm2y ymm2
+%define zmm3y ymm3
+%define zmm4y ymm4
+%define zmm5y ymm5
+%define zmm6y ymm6
+%define zmm7y ymm7
+%define zmm8y ymm8
+%define zmm9y ymm9
+%define zmm10y ymm10
+%define zmm11y ymm11
+%define zmm12y ymm12
+%define zmm13y ymm13
+%define zmm14y ymm14
+%define zmm15y ymm15
+%define zmm16y ymm16
+%define zmm17y ymm17
+%define zmm18y ymm18
+%define zmm19y ymm19
+%define zmm20y ymm20
+%define zmm21y ymm21
+%define zmm22y ymm22
+%define zmm23y ymm23
+%define zmm24y ymm24
+%define zmm25y ymm25
+%define zmm26y ymm26
+%define zmm27y ymm27
+%define zmm28y ymm28
+%define zmm29y ymm29
+%define zmm30y ymm30
+%define zmm31y ymm31
+
+%define DWORD(reg) reg %+ d
+%define WORD(reg) reg %+ w
+%define BYTE(reg) reg %+ b
+
+%define XWORD(reg) reg %+ x
+
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+ %define __x86_64__
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,win64
+ %define __x86_64__
+%endif
+%ifidn __OUTPUT_FORMAT__,macho64
+ %define __x86_64__
+%endif
+
+%ifdef __x86_64__
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa
+%else
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb
+%endif
+
+%ifdef REL_TEXT
+ %define WRT_OPT
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define WRT_OPT wrt ..plt
+%else
+ %define WRT_OPT
+%endif
+
+%macro mk_global 1-3
+ %ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, macho64
+ global %1
+ %elifidn __OUTPUT_FORMAT__, win64
+ global %1
+ %else
+ global %1:%2 %3
+ %endif
+ %else
+ global %1:%2 %3
+ %endif
+%endmacro
+
+
+; Fixes for nasm lack of MS proc helpers
+%ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, win64
+ %macro alloc_stack 1
+ sub rsp, %1
+ %endmacro
+
+ %macro proc_frame 1
+ %1:
+ %endmacro
+
+ %macro save_xmm128 2
+ movdqa [rsp + %2], %1
+ %endmacro
+
+ %macro save_reg 2
+ mov [rsp + %2], %1
+ %endmacro
+
+ %macro rex_push_reg 1
+ push %1
+ %endmacro
+
+ %macro push_reg 1
+ push %1
+ %endmacro
+
+ %define end_prolog
+ %endif
+
+ %define endproc_frame
+%endif
+
+%ifidn __OUTPUT_FORMAT__, macho64
+ %define elf64 macho64
+ mac_equ equ 1
+%endif
+
+%macro slversion 4
+ section .text
+ global %1_slver_%2%3%4
+ global %1_slver
+ %1_slver:
+ %1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+
+%endif ; ifndef _REG_SIZES_ASM_
diff --git a/src/isa-l/include/test.h b/src/isa-l/include/test.h
new file mode 100644
index 000000000..31ccc67b9
--- /dev/null
+++ b/src/isa-l/include/test.h
@@ -0,0 +1,285 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _TEST_H
+#define _TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+/* Decide whether to use benchmark time as an approximation or a minimum. Fewer
+ * calls to the timer are required for the approximation case.*/
+#define BENCHMARK_MIN_TIME 0
+#define BENCHMARK_APPROX_TIME 1
+#ifndef BENCHMARK_TYPE
+#define BENCHMARK_TYPE BENCHMARK_MIN_TIME
+#endif
+
+#ifdef USE_RDTSC
+/* The use of rdtsc is nuanced. On many processors it corresponds to a
+ * standardized clock source. To obtain a meaningful result it may be
+ * necessary to fix the CPU clock to match the rdtsc tick rate.
+ */
+# include <inttypes.h>
+# include <x86intrin.h>
+# define USE_CYCLES
+#else
+# include <time.h>
+#define USE_SECONDS
+#endif
+
+#ifdef USE_RDTSC
+#ifndef BENCHMARK_TIME
+# define BENCHMARK_TIME 6
+#endif
+# define GHZ 1000000000
+# define UNIT_SCALE (GHZ)
+# define CALLIBRATE_TIME (UNIT_SCALE / 2)
+static inline long long get_time(void) {
+ unsigned int dummy;
+ return __rdtscp(&dummy);
+}
+
+static inline long long get_res(void) {
+ return 1;
+}
+#else
+#ifndef BENCHMARK_TIME
+# define BENCHMARK_TIME 3
+#endif
+#ifdef _MSC_VER
+#define UNIT_SCALE get_res()
+#define CALLIBRATE_TIME (UNIT_SCALE / 4)
+static inline long long get_time(void) {
+ long long ret = 0;
+ QueryPerformanceCounter(&ret);
+ return ret;
+}
+
+static inline long long get_res(void) {
+ long long ret = 0;
+ QueryPerformanceFrequency(&ret);
+ return ret;
+}
+#else
+# define NANO_SCALE 1000000000
+# define UNIT_SCALE NANO_SCALE
+# define CALLIBRATE_TIME (UNIT_SCALE / 4)
+#ifdef __FreeBSD__
+# define CLOCK_ID CLOCK_MONOTONIC_PRECISE
+#else
+# define CLOCK_ID CLOCK_MONOTONIC
+#endif
+
+static inline long long get_time(void) {
+ struct timespec time;
+ long long nano_total;
+ clock_gettime(CLOCK_ID, &time);
+ nano_total = time.tv_sec;
+ nano_total *= NANO_SCALE;
+ nano_total += time.tv_nsec;
+ return nano_total;
+}
+
+static inline long long get_res(void) {
+ struct timespec time;
+ long long nano_total;
+ clock_getres(CLOCK_ID, &time);
+ nano_total = time.tv_sec;
+ nano_total *= NANO_SCALE;
+ nano_total += time.tv_nsec;
+ return nano_total;
+}
+#endif
+#endif
+struct perf {
+ long long start;
+ long long stop;
+ long long run_total;
+ long long iterations;
+};
+
+static inline void perf_init(struct perf *p) {
+ p->start = 0;
+ p->stop = 0;
+ p->run_total = 0;
+}
+
+static inline void perf_continue(struct perf *p) {
+ p->start = get_time();
+}
+
+static inline void perf_pause(struct perf *p) {
+ p->stop = get_time();
+ p->run_total = p->run_total + p->stop - p->start;
+ p->start = p->stop;
+}
+
+static inline void perf_start(struct perf *p) {
+ perf_init(p);
+ perf_continue(p);
+}
+
+static inline void perf_stop(struct perf *p) {
+ perf_pause(p);
+}
+
+static inline double get_time_elapsed(struct perf *p) {
+ return 1.0 * p->run_total / UNIT_SCALE;
+}
+
+static inline long long get_base_elapsed(struct perf *p) {
+ return p->run_total;
+}
+
+static inline unsigned long long estimate_perf_iterations(struct perf *p,
+ unsigned long long runs,
+ unsigned long long total) {
+ total = total * runs;
+ if (get_base_elapsed(p) > 0)
+ return (total + get_base_elapsed(p) - 1) / get_base_elapsed(p);
+ else
+ return (total + get_res() - 1) / get_res();
+}
+
+#define CALLIBRATE(PERF, FUNC_CALL) { \
+ unsigned long long _i, _iter = 1; \
+ perf_start(PERF); \
+ FUNC_CALL; \
+ perf_pause(PERF); \
+ \
+ while (get_base_elapsed(PERF) < CALLIBRATE_TIME) { \
+ _iter = estimate_perf_iterations(PERF, _iter, \
+ 2 * CALLIBRATE_TIME); \
+ perf_start(PERF); \
+ for (_i = 0; _i < _iter; _i++) { \
+ FUNC_CALL; \
+ } \
+ perf_stop(PERF); \
+ } \
+ (PERF)->iterations=_iter; \
+}
+
+#define PERFORMANCE_TEST(PERF, RUN_TIME, FUNC_CALL) { \
+ unsigned long long _i, _iter = (PERF)->iterations; \
+ unsigned long long _run_total = RUN_TIME; \
+ _run_total *= UNIT_SCALE; \
+ _iter = estimate_perf_iterations(PERF, _iter, _run_total);\
+ (PERF)->iterations = 0; \
+ perf_start(PERF); \
+ for (_i = 0; _i < _iter; _i++) { \
+ FUNC_CALL; \
+ } \
+ perf_pause(PERF); \
+ (PERF)->iterations += _iter; \
+ \
+ if(get_base_elapsed(PERF) < _run_total && \
+ BENCHMARK_TYPE == BENCHMARK_MIN_TIME) { \
+ _iter = estimate_perf_iterations(PERF, _iter, \
+ _run_total - get_base_elapsed(PERF) + \
+ (UNIT_SCALE / 16)); \
+ perf_continue(PERF); \
+ for (_i = 0; _i < _iter; _i++) { \
+ FUNC_CALL; \
+ } \
+ perf_pause(PERF); \
+ (PERF)->iterations += _iter; \
+ } \
+}
+
+#define BENCHMARK(PERF, RUN_TIME, FUNC_CALL) { \
+ if((RUN_TIME) > 0) { \
+ CALLIBRATE(PERF, FUNC_CALL); \
+ PERFORMANCE_TEST(PERF, RUN_TIME, FUNC_CALL); \
+ \
+ } else { \
+ (PERF)->iterations = 1; \
+ perf_start(PERF); \
+ FUNC_CALL; \
+ perf_stop(PERF); \
+ } \
+}
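Putting the pieces together, a benchmark typically declares a struct perf, wraps the call under test in BENCHMARK(), and reports with perf_print(). A rough sketch (memset merely stands in for the routine being measured):

    #include <string.h>

    struct perf p;
    static unsigned char buf[64 * 1024];

    BENCHMARK(&p, BENCHMARK_TIME, memset(buf, 0x5a, sizeof(buf)));
    perf_print(p, sizeof(buf));      /* prints runtime and MB/s (or ticks) */
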
+
+#ifdef USE_CYCLES
+static inline void perf_print(struct perf p, long long unit_count) {
+ long long total_units = p.iterations * unit_count;
+
+ printf("runtime = %10lld ticks", get_base_elapsed(&p));
+ if (total_units != 0) {
+ printf(", bandwidth %lld MB in %.4f GC = %.2f ticks/byte",
+ total_units / (1000000), get_time_elapsed(&p),
+ get_base_elapsed(&p) / (double)total_units);
+ }
+ printf("\n");
+}
+#else
+static inline void perf_print(struct perf p, double unit_count) {
+ long long total_units = p.iterations * unit_count;
+ long long usecs = (long long)(get_time_elapsed(&p) * 1000000);
+
+ printf("runtime = %10lld usecs", usecs);
+ if (total_units != 0) {
+ printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s",
+ total_units / (1000000), get_time_elapsed(&p),
+ ((double)total_units) / (1000000 * get_time_elapsed(&p)));
+ }
+ printf("\n");
+}
+#endif
+
+static inline uint64_t get_filesize(FILE * fp) {
+ uint64_t file_size;
+ fpos_t pos, pos_curr;
+
+ fgetpos(fp, &pos_curr); /* Save current position */
+#if defined(_WIN32) || defined(_WIN64)
+ _fseeki64(fp, 0, SEEK_END);
+#else
+ fseeko(fp, 0, SEEK_END);
+#endif
+ fgetpos(fp, &pos);
+ file_size = *(uint64_t *) & pos;
+ fsetpos(fp, &pos_curr); /* Restore position */
+
+ return file_size;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _TEST_H
diff --git a/src/isa-l/include/types.h b/src/isa-l/include/types.h
new file mode 100644
index 000000000..531c79724
--- /dev/null
+++ b/src/isa-l/include/types.h
@@ -0,0 +1,77 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file types.h
+ * @brief Defines standard width types.
+ *
+ */
+
+#ifndef __TYPES_H
+#define __TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+#ifdef __MINGW32__
+# include <_mingw.h>
+#endif
+#endif
+
+
+#if defined __unix__ || defined __APPLE__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define __forceinline static inline
+# define aligned_free(x) free(x)
+#else
+# ifdef __MINGW32__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# else
+# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# endif
+#endif
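For instance (illustrative only), these shims let the same aligned-allocation code build on GCC/Clang and MSVC alike; <stdlib.h> is assumed for the POSIX path.

    DECLARE_ALIGNED(unsigned char table[256], 64);   /* 64B-aligned buffer */

    void *buf;
    if (posix_memalign(&buf, 32, 4096) == 0) {
            /* ... use the 32B-aligned allocation ... */
            aligned_free(buf);
    }
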
+
+#ifdef DEBUG
+# define DEBUG_PRINT(x) printf x
+#else
+# define DEBUG_PRINT(x) do {} while (0)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__TYPES_H
diff --git a/src/isa-l/include/unaligned.h b/src/isa-l/include/unaligned.h
new file mode 100644
index 000000000..f7b1ed88e
--- /dev/null
+++ b/src/isa-l/include/unaligned.h
@@ -0,0 +1,76 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef UNALIGNED_H
+#define UNALIGNED_H
+
+#include "stdint.h"
+#include "string.h"
+
+static inline uint16_t load_u16(uint8_t * buf) {
+ uint16_t ret;
+ memcpy(&ret, buf, sizeof(ret));
+ return ret;
+}
+
+static inline uint32_t load_u32(uint8_t * buf) {
+ uint32_t ret;
+ memcpy(&ret, buf, sizeof(ret));
+ return ret;
+}
+
+static inline uint64_t load_u64(uint8_t * buf) {
+ uint64_t ret;
+ memcpy(&ret, buf, sizeof(ret));
+ return ret;
+}
+
+static inline uintmax_t load_umax(uint8_t * buf) {
+ uintmax_t ret;
+ memcpy(&ret, buf, sizeof(ret));
+ return ret;
+}
+
+static inline void store_u16(uint8_t * buf, uint16_t val) {
+ memcpy(buf, &val, sizeof(val));
+}
+
+static inline void store_u32(uint8_t * buf, uint32_t val) {
+ memcpy(buf, &val, sizeof(val));
+}
+
+static inline void store_u64(uint8_t * buf, uint64_t val) {
+ memcpy(buf, &val, sizeof(val));
+}
+
+static inline void store_umax(uint8_t * buf, uintmax_t val) {
+ memcpy(buf, &val, sizeof(val));
+}
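These helpers avoid the undefined behavior of dereferencing a misaligned pointer cast; for example (sketch, hdr is a placeholder byte buffer), reading and rewriting a 32-bit field at an odd offset. Note the accesses are native-endian.

    uint8_t hdr[16] = { 0 };
    uint32_t len = load_u32(hdr + 3);    /* safe even though hdr+3 is unaligned */
    store_u32(hdr + 3, len + 1);         /* write it back the same way */
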
+
+#endif