10 files changed, 3055 insertions, 0 deletions
diff --git a/src/isa-l/include/crc.h b/src/isa-l/include/crc.h
new file mode 100644
index 00000000..5d6b0495
--- /dev/null
+++ b/src/isa-l/include/crc.h
@@ -0,0 +1,134 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ *  @file  crc.h
+ *  @brief CRC functions.
+ */
+
+
+#ifndef _CRC_H_
+#define _CRC_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Multi-binary functions */
+
+/**
+ * @brief Generate CRC from the T10 standard, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @returns 16 bit CRC
+ */
+uint16_t crc16_t10dif(
+	uint16_t init_crc,        //!< initial CRC value, 16 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+
+/**
+ * @brief Generate CRC from the IEEE standard, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @returns 32 bit CRC
+ */
+
+uint32_t crc32_ieee(
+	uint32_t init_crc,        //!< initial CRC value, 32 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+
+/**
+ * @brief iSCSI CRC function, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * Note: argument order is (buffer, len, init_crc), unlike crc32_ieee().
+ * @returns 32 bit CRC
+ */
+unsigned int crc32_iscsi(
+	unsigned char *buffer, //!< buffer to calculate CRC on
+	int len,               //!< buffer length in bytes (signed int)
+	unsigned int init_crc  //!< initial CRC value
+	);
+
+
+/* Base functions */
+
+/**
+ * @brief iSCSI CRC function, baseline version; same argument order as crc32_iscsi()
+ * @returns 32 bit CRC
+ */
+unsigned int crc32_iscsi_base(
+	unsigned char *buffer,	//!< buffer to calculate CRC on
+	int len, 		//!< buffer length in bytes (signed int)
+	unsigned int crc_init	//!< initial CRC value
+	);
+
+
+/**
+ * @brief Generate CRC from the T10 standard, runs baseline version
+ * @returns 16 bit CRC
+ */
+uint16_t crc16_t10dif_base(
+	uint16_t seed,	//!< initial CRC value, 16 bits
+	uint8_t *buf,	//!< buffer to calculate CRC on (not const-qualified, unlike crc16_t10dif())
+	uint64_t len 	//!< buffer length in bytes (64-bit value)
+	);
+
+
+/**
+ * @brief Generate CRC from the IEEE standard, runs baseline version
+ * @returns 32 bit CRC
+ */
+uint32_t crc32_ieee_base(
+	uint32_t seed, 	//!< initial CRC value, 32 bits
+	uint8_t *buf,	//!< buffer to calculate CRC on (not const-qualified, unlike crc32_ieee())
+	uint64_t len 	//!< buffer length in bytes (64-bit value)
+	);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _CRC_H_
diff --git a/src/isa-l/include/crc64.h b/src/isa-l/include/crc64.h
new file mode 100644
index 00000000..8d7d81f9
--- /dev/null
+++ b/src/isa-l/include/crc64.h
@@ -0,0 +1,277 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ *  @file  crc64.h
+ *  @brief CRC64 functions.
+ */
+
+
+#ifndef _CRC64_H_
+#define _CRC64_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Multi-binary functions */
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in reflected format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_refl(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in normal format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_norm(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ISO standard in reflected format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_refl(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ISO standard in normal format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_norm(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in reflected format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_refl(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in normal format, runs
+ * appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_norm(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/* Arch specific versions */
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in reflected format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_ecma_refl_by8(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in normal format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_ecma_norm_by8(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in reflected format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_refl_base(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ECMA-182 standard in normal format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_ecma_norm_base(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ISO standard in reflected format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_iso_refl_by8(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ISO standard in normal format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_iso_norm_by8(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ISO standard in reflected format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_refl_base(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from ISO standard in normal format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_iso_norm_base(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in reflected format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_jones_refl_by8(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in normal format.
+ * @requires SSE3, CLMUL
+ *
+ * @returns 64 bit CRC
+ */
+
+uint64_t crc64_jones_norm_by8(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in reflected format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_refl_base(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+/**
+ * @brief Generate CRC from "Jones" coefficients in normal format, runs baseline version
+ * @returns 64 bit CRC
+ */
+uint64_t crc64_jones_norm_base(
+	uint64_t init_crc,        //!< initial CRC value, 64 bits
+	const unsigned char *buf, //!< buffer to calculate CRC on
+	uint64_t len              //!< buffer length in bytes (64-bit data)
+	);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _CRC64_H_
diff --git a/src/isa-l/include/erasure_code.h b/src/isa-l/include/erasure_code.h
new file mode 100644
index 00000000..570944c3
--- /dev/null
+++ b/src/isa-l/include/erasure_code.h
@@ -0,0 +1,933 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _ERASURE_CODE_H_
+#define _ERASURE_CODE_H_
+
+/**
+ *  @file erasure_code.h
+ *  @brief Interface to functions supporting erasure code encode and decode.
+ *
+ *  This file defines the interface to optimized functions used in erasure
+ *  codes.  Encode and decode of erasures in GF(2^8) are made by calculating the
+ *  dot product of the symbols (bytes in GF(2^8)) across a set of buffers and a
+ *  set of coefficients.  Values for the coefficients are determined by the type
+ *  of erasure code.  Using a general dot product means that any sequence of
+ *  coefficients may be used including erasure codes based on random 
+ *  coefficients.
+ *  Multiple versions of dot product are supplied to calculate 1-6 output
+ *  vectors in one pass.
+ *  Base GF multiply and divide functions can be sped up by defining
+ *  GF_LARGE_TABLES at the expense of memory size.
+ *
+ */
+
+#include "gf_vect_mul.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Initialize tables for fast Erasure Code encode and decode.
+ *
+ * Generates the expanded tables needed for fast encode or decode for erasure
+ * codes on blocks of data.  32 bytes are generated for each input coefficient.
+ *
+ * @param k      The number of vector sources or rows in the generator matrix
+ *               for coding.
+ * @param rows   The number of output vectors to concurrently encode/decode.
+ * @param a      Pointer to sets of arrays of input coefficients used to encode
+ *               or decode data.
+ * @param gftbls Pointer to start of space for concatenated output tables
+ *               generated from input coefficients.  Must hold 32*k*rows bytes.
+ * @returns none
+ */
+
+void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
+ *
+ * Given a list of source data blocks, generate one or multiple blocks of
+ * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
+ * suitable set of coefficients, this function will perform the fast generation
+ * or decoding of Reed-Solomon type erasure codes.
+ * 
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ * 
+ * @param len    Length of each block of data (vector) of source or dest data.
+ * @param k      The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param rows   The number of output vectors to concurrently encode/decode.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
+ * @param data   Array of pointers to source input buffers.
+ * @param coding Array of pointers to coded output buffers.
+ * @returns none
+ */
+
+void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+		    unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires SSE4.1
+ */
+void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX
+ */
+void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX2
+ */
+void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			 unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ *
+ * Baseline version of ec_encode_data(): srcs=k, dests=rows, v=gftbls, src=data, dest=coding.
+ */
+void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
+			 unsigned char **dest);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
+ *
+ * Given one source data block, update one or multiple blocks of encoded data as
+ * specified by a matrix of GF(2^8) coefficients. When given a suitable set of
+ * coefficients, this function will perform the fast generation or decoding of
+ * Reed-Solomon type erasure codes from one input source at a time.
+ * 
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ * 
+ * @param len    Length of each block of data (vector) of source or dest data.
+ * @param k      The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param rows   The number of output vectors to concurrently encode/decode.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param g_tbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
+ * @param data   Pointer to single input source used to update output parity.
+ * @param coding Array of pointers to coded output buffers.
+ * @returns none
+ */
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			   unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires SSE4.1
+ */
+
+void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX
+ */
+
+void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX2
+ */
+
+void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+				unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Baseline version of ec_encode_data_update(); here v is the g_tbls array and dest is coding.
+ */
+
+void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v, 
+				unsigned char *data, unsigned char **dest);
+
+
+/**
+ * @brief GF(2^8) vector dot product.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ *               on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, 
+			unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires AVX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ *               on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, 
+			unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires AVX2
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ *               on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, 
+			unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires AVX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires AVX2
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.  
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
+			unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product, runs baseline version.
+ * 
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * 
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ *               on the array of input coefficients. Only elements 32*CONST*j + 1 
+ *               of this array are used, where j = (0, 1, 2...) and CONST is the
+ *               number of elements in the array of input coefficients. The 
+ *               elements used correspond to the original input coefficients.		
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
+                        unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product, runs appropriate version.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * 
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ *               on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
+                        unsigned char **src, unsigned char *dest);
+
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, runs appropriate version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		 unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires SSE4.1
+ */
+
+void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		     unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX
+ */
+
+void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		     unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX2
+ */
+
+void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, baseline version.
+ *
+ * Baseline version of gf_vect_mad() with same parameters.
+ */
+
+void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
+		      unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate.  SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
+ * @requires AVX
+ */
+void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
+ * @requires AVX
+ */
+void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
+ * @requires AVX
+ */
+void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+
+/**********************************************************************
+ * The remaining are lib support functions used in GF(2^8) operations.
+ */
+
+/**
+ * @brief Single element GF(2^8) multiply.
+ *
+ * @param a  Multiplicand a
+ * @param b  Multiplicand b
+ * @returns  Product of a and b in GF(2^8)
+ */
+
+unsigned char gf_mul(unsigned char a, unsigned char b);
+
+/**
+ * @brief Single element GF(2^8) inverse.
+ *
+ * @param a  Input element
+ * @returns  Field element b such that a x b = {1}
+ */
+
+unsigned char gf_inv(unsigned char a);
+
+/**
+ * @brief Generate a matrix of coefficients to be used for encoding.
+ *
+ * Vandermonde matrix example of encoding coefficients where high portion of
+ * matrix is identity matrix I and lower portion is constructed as 2^{i*(j-k+1)}
+ * i:{0,k-1} j:{k,m-1}. Commonly used method for choosing coefficients in
+ * erasure encoding but does not guarantee invertibility of every sub-matrix. For
+ * large k it is possible to find cases where the decode matrix chosen from
+ * sources and parity not in erasure is not invertible. Users may want to
+ * adjust for k > 5.
+ *
+ * @param a  [mxk] array to hold coefficients
+ * @param m  number of rows in matrix corresponding to srcs + parity.
+ * @param k  number of columns in matrix corresponding to srcs.
+ * @returns  none
+ */
+
+void gf_gen_rs_matrix(unsigned char *a, int m, int k);
+
+/**
+ * @brief Generate a Cauchy matrix of coefficients to be used for encoding.
+ *
+ * Cauchy matrix example of encoding coefficients where high portion of matrix
+ * is identity matrix I and lower portion is constructed as 1/(i + j) | i != j,
+ * i:{0,k-1} j:{k,m-1}.  Any sub-matrix of a Cauchy matrix should be invertible.
+ *
+ * @param a  [mxk] array to hold coefficients
+ * @param m  number of rows in matrix corresponding to srcs + parity.
+ * @param k  number of columns in matrix corresponding to srcs.
+ * @returns  none
+ */
+
+void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
+
+/**
+ * @brief Invert a matrix in GF(2^8)
+ *
+ * @param in  input matrix
+ * @param out output matrix such that [in] x [out] = [I] - identity matrix
+ * @param n   size of matrix [nxn]
+ * @returns 0 successful, other fail on singular input matrix
+ */
+
+int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
+
+
+/*************************************************************/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ERASURE_CODE_H_
diff --git a/src/isa-l/include/gf_vect_mul.h b/src/isa-l/include/gf_vect_mul.h
new file mode 100644
index 00000000..42aa0a45
--- /dev/null
+++ b/src/isa-l/include/gf_vect_mul.h
@@ -0,0 +1,148 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _GF_VECT_MUL_H
+#define _GF_VECT_MUL_H
+
+/**
+ *  @file gf_vect_mul.h
+ *  @brief Interface to functions for vector (block) multiplication in GF(2^8).
+ *
+ *  This file defines the interface to routines used in fast RAID rebuild and 
+ *  erasure codes.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /**
+ * @brief GF(2^8) vector multiply by constant.
+ *
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
+ * Len, src and dest must be aligned to 32B.
+ * @requires SSE4.1
+ *
+ * @param len   Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src   Pointer to src data array. Must be aligned to 32B.
+ * @param dest  Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ */
+
+int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest);
+
+
+ /**
+ * @brief GF(2^8) vector multiply by constant.
+ *
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
+ * Len, src and dest must be aligned to 32B.
+ * @requires AVX
+ *
+ * @param len   Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src   Pointer to src data array. Must be aligned to 32B.
+ * @param dest  Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ */
+
+int gf_vect_mul_avx(int len, unsigned char *gftbl, void *src, void *dest);
+
+
+/**
+ * @brief GF(2^8) vector multiply by constant, runs appropriate version.
+ * 	
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
+ * Len, src and dest must be aligned to 32B.
+ *
+ * This function determines what instruction sets are enabled 
+ * and selects the appropriate version at runtime. 
+ * 
+ * @param len   Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src   Pointer to src data array. Must be aligned to 32B.
+ * @param dest  Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ */
+
+int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
+
+
+/**
+ * @brief Initialize 32-byte constant array for GF(2^8) vector multiply
+ *
+ * Calculates array {C{00}, C{01}, C{02}, ... , C{0f} }, {C{00}, C{10},
+ * C{20}, ... , C{f0} } as required by other fast vector multiply
+ * functions.
+ * @param c     Constant input.
+ * @param gftbl Table output.
+ */
+
+void gf_vect_mul_init(unsigned char c, unsigned char* gftbl);
+
+
+/**
+ * @brief GF(2^8) vector multiply by constant, runs baseline version.
+ *
+ * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
+ * is a single field element in GF(2^8). Can be used for RAID6 rebuild
+ * and partial write functions. Function requires pre-calculation of a
+ * 32-element constant array based on constant C. gftbl(C) = {C{00},
+ * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
+ * Len, src and dest must be aligned to 32B.
+ *
+ * @param len   Length of vector in bytes. Must be aligned to 32B.
+ * @param a     Pointer to 32-byte array of pre-calculated constants based on C.
+ *              Only the 2nd element is used.
+ * @param src   Pointer to src data array. Must be aligned to 32B.
+ * @param dest  Pointer to destination data array. Must be aligned to 32B.
+ */
+
+void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, 
+			unsigned char *dest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_GF_VECT_MUL_H
diff --git a/src/isa-l/include/igzip_lib.h b/src/isa-l/include/igzip_lib.h
new file mode 100644
index 00000000..3cba3faf
--- /dev/null
+++ b/src/isa-l/include/igzip_lib.h
@@ -0,0 +1,636 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _IGZIP_H
+#define _IGZIP_H
+
+/**
+ * @file igzip_lib.h
+ *
+ * @brief This file defines the igzip compression and decompression interface, a
+ * high performance deflate compression interface for storage applications.
+ *
+ * Deflate is a widely used compression standard that can be used standalone, it
+ * also forms the basis of gzip and zlib compression formats. Igzip supports the
+ * following flush features:
+ *
+ * - No Flush: The default method where no special flush is performed.
+ *
+ * - Sync flush: whereby isal_deflate() finishes the current deflate block at
+ *   the end of each input buffer. The deflate block is byte aligned by
+ *   appending an empty stored block.
+ *
+ * - Full flush: whereby isal_deflate() finishes and aligns the deflate block as
+ *   in sync flush but also ensures that subsequent block's history does not
+ *   look back beyond this point and new blocks are fully independent.
+ *
+ * Igzip also supports compression levels from ISAL_DEF_MIN_LEVEL to
+ * ISAL_DEF_MAX_LEVEL.
+ *
+ * Igzip contains some behaviour configurable at compile time. These
+ * configurable options are:
+ *
+ * - IGZIP_HIST_SIZE - Defines the window size. The default value is 32K (note K
+ *   represents 1024), but 8K is also supported. Powers of 2 which are at most
+ *   32K may also work.
+ *
+ * - LONGER_HUFFTABLE - Defines whether to use a larger hufftables structure
+ *   which may increase performance with smaller IGZIP_HIST_SIZE values. By
+ *   default this option is not defined. This define sets IGZIP_HIST_SIZE to be
+ *   8K if IGZIP_HIST_SIZE > 8K.
+ *
+ *   As an example, to compile gzip with an 8K window size, in a terminal run
+ *   @verbatim gmake D="-D IGZIP_HIST_SIZE=8*1024" @endverbatim on Linux and
+ *   FreeBSD, or with @verbatim nmake -f Makefile.nmake D="-D
+ *   IGZIP_HIST_SIZE=8*1024" @endverbatim on Windows.
+ *
+ */
+#include <stdint.h>
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+/* Deflate Compression Standard Defines */
+/******************************************************************************/
+#define IGZIP_K  1024	/* K represents 1024 throughout this header */
+#define ISAL_DEF_MAX_HDR_SIZE 328	/* Byte size of isal_hufftables.deflate_hdr and inflate_state.tmp_in_buffer */
+#define ISAL_DEF_MAX_CODE_LEN 15
+#define ISAL_DEF_HIST_SIZE (32*IGZIP_K)	/* Default and upper bound for IGZIP_HIST_SIZE */
+
+#define ISAL_DEF_LIT_SYMBOLS 257
+#define ISAL_DEF_LEN_SYMBOLS 29
+#define ISAL_DEF_DIST_SYMBOLS 30	/* Entry count of isal_huff_histogram.dist_histogram */
+#define ISAL_DEF_LIT_LEN_SYMBOLS (ISAL_DEF_LIT_SYMBOLS + ISAL_DEF_LEN_SYMBOLS)	/* Entry count of isal_huff_histogram.lit_len_histogram */
+
+#define ISAL_LOOK_AHEAD (18 * 16)	/* Max repeat length, rounded up to 32 byte boundary */
+
+/******************************************************************************/
+/* Deflate Implementation Specific Defines */
+/******************************************************************************/
+/* Note IGZIP_HIST_SIZE must be a power of two; it may be predefined at build time */
+#ifndef IGZIP_HIST_SIZE
+#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE
+#endif
+
+#if (IGZIP_HIST_SIZE > ISAL_DEF_HIST_SIZE)	/* Clamp oversized values to ISAL_DEF_HIST_SIZE */
+#undef IGZIP_HIST_SIZE
+#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE
+#endif
+
+#ifdef LONGER_HUFFTABLE
+#if (IGZIP_HIST_SIZE > 8 * IGZIP_K)	/* LONGER_HUFFTABLE caps the window at 8K */
+#undef IGZIP_HIST_SIZE
+#define IGZIP_HIST_SIZE (8 * IGZIP_K)
+#endif
+#endif	/* LONGER_HUFFTABLE */
+
+#define ISAL_LIMIT_HASH_UPDATE
+
+#ifndef IGZIP_HASH_SIZE
+#define IGZIP_HASH_SIZE  (8 * IGZIP_K)	/* Entry count of isal_zstate.head and isal_huff_histogram.hash_table */
+#endif
+
+#ifdef LONGER_HUFFTABLE
+enum {IGZIP_DIST_TABLE_SIZE = 8*1024};	/* Entry count of isal_hufftables.dist_table */
+
+/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */
+enum { IGZIP_DECODE_OFFSET = 26 };
+#else
+enum {IGZIP_DIST_TABLE_SIZE = 2};	/* Entry count of isal_hufftables.dist_table */
+/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */
+enum { IGZIP_DECODE_OFFSET = 0 };
+#endif	/* LONGER_HUFFTABLE */
+enum {IGZIP_LEN_TABLE_SIZE = 256};	/* Entry count of isal_hufftables.len_table */
+enum {IGZIP_LIT_TABLE_SIZE = ISAL_DEF_LIT_SYMBOLS};	/* Entry count of isal_hufftables.lit_table and lit_table_sizes */
+
+#define IGZIP_HUFFTABLE_CUSTOM 0	/* Use the isal_hufftables passed to isal_deflate_set_hufftables() */
+#define IGZIP_HUFFTABLE_DEFAULT 1	/* Use the default Huffman code */
+#define IGZIP_HUFFTABLE_STATIC 2	/* Use the deflate standard static Huffman code */
+
+/* Flush Flags (values for isal_zstream.flush) */
+#define NO_FLUSH	0	/* Default */
+#define SYNC_FLUSH	1
+#define FULL_FLUSH	2
+#define FINISH_FLUSH	0	/* Deprecated; same value as NO_FLUSH */
+
+/* Gzip Flags (values for isal_zstream.gzip_flag) */
+#define IGZIP_DEFLATE	0	/* Default */
+#define IGZIP_GZIP	1	/* Gzip header and trailer written around the deflate data */
+#define IGZIP_GZIP_NO_HDR	2	/* Only the gzip trailer is written */
+
+/* Compression Return values */
+#define COMP_OK 0
+#define INVALID_FLUSH -7	/* Invalid flush type selected */
+#define INVALID_PARAM -8
+#define STATELESS_OVERFLOW -1	/* Output buffer will not fit the stateless output */
+#define ISAL_INVALID_OPERATION -9	/* Stream left unmodified, see isal_deflate_set_hufftables() */
+#define ISAL_INVALID_LEVEL -4	/* Invalid Compression level set */
+
+/**
+ *  @enum isal_zstate_state
+ *  @brief Compression State please note ZSTATE_TRL only applies for GZIP compression
+ */
+
+
+/* When the state is set to ZSTATE_NEW_HDR or ZSTATE_TMP_NEW_HDR, the
+ * hufftable being used for compression may be swapped
+ */
+enum isal_zstate_state {
+	ZSTATE_NEW_HDR,  //!< Header to be written
+	ZSTATE_HDR,	//!< Header state
+	ZSTATE_CREATE_HDR, //!< Header to be created
+	ZSTATE_BODY,	//!< Body state
+	ZSTATE_FLUSH_READ_BUFFER, //!< Flush buffer
+	ZSTATE_FLUSH_ICF_BUFFER, //!< Flush ICF buffer
+	ZSTATE_SYNC_FLUSH, //!< Write sync flush block
+	ZSTATE_FLUSH_WRITE_BUFFER, //!< Flush bitbuf
+	ZSTATE_TRL,	//!< Trailer state
+	ZSTATE_END,	//!< End state
+	ZSTATE_TMP_NEW_HDR, //!< Temporary Header to be written
+	ZSTATE_TMP_HDR,	//!< Temporary Header state
+	ZSTATE_TMP_CREATE_HDR, //!< Temporary Header to be created state
+	ZSTATE_TMP_BODY,	//!< Temporary Body state
+	ZSTATE_TMP_FLUSH_READ_BUFFER, //!< Flush buffer
+	ZSTATE_TMP_FLUSH_ICF_BUFFER, //!< Flush ICF buffer
+	ZSTATE_TMP_SYNC_FLUSH, //!< Write sync flush block
+	ZSTATE_TMP_FLUSH_WRITE_BUFFER, //!< Flush bitbuf
+	ZSTATE_TMP_TRL,	//!< Temporary Trailer state
+	ZSTATE_TMP_END	//!< Temporary End state
+};
+
+/* Offset between a non-tmp state and its TMP twin; parenthesized so it expands safely inside larger expressions */
+#define ZSTATE_TMP_OFFSET (ZSTATE_TMP_HDR - ZSTATE_HDR)
+
+/******************************************************************************/
+/* Inflate Implementation Specific Defines */
+/******************************************************************************/
+#define ISAL_DECODE_LONG_BITS 12	/* log2 of the short_code_lookup size in inflate_huff_code_large */
+#define ISAL_DECODE_SHORT_BITS 10	/* log2 of the short_code_lookup size in inflate_huff_code_small */
+
+/* Current state of decompression (stored in inflate_state.block_state) */
+enum isal_block_state {
+	ISAL_BLOCK_NEW_HDR,	/* Just starting a new block */
+	ISAL_BLOCK_HDR,		/* In the middle of reading in a block header */
+	ISAL_BLOCK_TYPE0,	/* Decoding a type 0 block */
+	ISAL_BLOCK_CODED,	/* Decoding a huffman coded block */
+	ISAL_BLOCK_INPUT_DONE,	/* Decompression of input is completed */
+	ISAL_BLOCK_FINISH	/* Decompression of input is completed and all data has been flushed to output */
+};
+
+/* Inflate Return values (returned by isal_inflate() and isal_inflate_stateless()) */
+#define ISAL_DECOMP_OK 0	/* No errors encountered while decompressing */
+#define ISAL_END_INPUT 1	/* End of input reached */
+#define ISAL_OUT_OVERFLOW 2	/* End of output reached */
+#define ISAL_INVALID_BLOCK -1	/* Invalid deflate block found */
+#define ISAL_INVALID_SYMBOL -2	/* Invalid deflate symbol found */
+#define ISAL_INVALID_LOOKBACK -3	/* Invalid lookback distance found */
+
+/******************************************************************************/
+/* Compression structures */
+/******************************************************************************/
+/** @brief Holds histogram of deflate symbols; must be zeroed before first use with isal_update_histogram() */
+struct isal_huff_histogram {
+	uint64_t lit_len_histogram[ISAL_DEF_LIT_LEN_SYMBOLS]; //!< Histogram of Literal/Len symbols seen
+	uint64_t dist_histogram[ISAL_DEF_DIST_SYMBOLS]; //!< Histogram of Distance Symbols seen
+	uint16_t hash_table[IGZIP_HASH_SIZE]; //!< Tmp space used as a hash table
+};
+
+struct isal_mod_hist {	/* NOTE(review): undocumented here; embedded in isal_zstate as `hist` */
+    uint32_t d_hist[30];	/* presumably distance symbol counts (30 == ISAL_DEF_DIST_SYMBOLS) - confirm */
+    uint32_t ll_hist[513];	/* presumably literal/length symbol counts - confirm */
+};
+
+#define ISAL_DEF_MIN_LEVEL 0
+#define ISAL_DEF_MAX_LEVEL 1
+
+/* Defines used to set level data sizes; parenthesized so they expand safely in any expression */
+#define ISAL_DEF_LVL0_REQ 0
+#define ISAL_DEF_LVL1_REQ (4 * IGZIP_K) /* has to be at least sizeof(struct level_2_buf) */
+#define ISAL_DEF_LVL1_TOKEN_SIZE 4
+
+/* Data sizes for level specific data options */
+#define ISAL_DEF_LVL0_MIN ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_SMALL ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_MEDIUM ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_LARGE ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_EXTRA_LARGE ISAL_DEF_LVL0_REQ
+#define ISAL_DEF_LVL0_DEFAULT ISAL_DEF_LVL0_REQ
+
+#define ISAL_DEF_LVL1_MIN (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 1 * IGZIP_K)
+#define ISAL_DEF_LVL1_SMALL (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 16 * IGZIP_K)
+#define ISAL_DEF_LVL1_MEDIUM (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 32 * IGZIP_K)
+#define ISAL_DEF_LVL1_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 64 * IGZIP_K)
+#define ISAL_DEF_LVL1_EXTRA_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 128 * IGZIP_K)
+#define ISAL_DEF_LVL1_DEFAULT ISAL_DEF_LVL1_LARGE
+
+/** @brief Holds Bit Buffer information (embedded in isal_zstate as bitbuf)*/
+struct BitBuf2 {
+	uint64_t m_bits;	//!< bits in the bit buffer
+	uint32_t m_bit_count;	//!< number of valid bits in the bit buffer
+	uint8_t *m_out_buf;	//!< current position in the buffer to write to
+	uint8_t *m_out_end;	//!< end of buffer to write to
+	uint8_t *m_out_start;	//!< start of buffer to write to
+};
+
+/* Variable prefixes:
+ * b_ : Measured wrt the start of the buffer
+ * f_ : Measured wrt the start of the file (aka file_start)
+ */
+
+/** @brief Holds the internal state information for input and output compression streams*/
+struct isal_zstate {
+	uint32_t b_bytes_valid;	//!< number of bytes of valid data in buffer
+	uint32_t b_bytes_processed;	//!< keeps track of the number of bytes processed in isal_zstate.buffer
+	uint8_t *file_start;	//!< pointer to where file would logically start
+	uint32_t crc;		//!< Current crc
+	struct BitBuf2 bitbuf;	//!< Bit Buffer
+	enum isal_zstate_state state;	//!< Current state in processing the data stream
+	uint32_t count;	//!< used for partial header/trailer writes
+	uint8_t tmp_out_buff[16];	//!< temporary array
+	uint32_t tmp_out_start;	//!< temporary variable
+	uint32_t tmp_out_end;	//!< temporary variable
+	uint32_t has_eob;	//!< keeps track of eob on the last deflate block
+	uint32_t has_eob_hdr;	//!< keeps track of eob hdr (with BFINAL set)
+	uint32_t has_hist;	//!< flag to track if there is match history
+
+	struct isal_mod_hist hist;	//!< d_hist/ll_hist counters, see struct isal_mod_hist
+
+	DECLARE_ALIGNED(uint8_t buffer[2 * IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD], 32);	//!< Internal buffer
+	DECLARE_ALIGNED(uint16_t head[IGZIP_HASH_SIZE], 16);	//!< Hash array
+
+};
+
+/** @brief Holds the huffman tree used to huffman encode the input stream **/
+struct isal_hufftables {
+
+	uint8_t deflate_hdr[ISAL_DEF_MAX_HDR_SIZE]; //!< deflate huffman tree header
+	uint32_t deflate_hdr_count; //!< Number of whole bytes in deflate_hdr
+	uint32_t deflate_hdr_extra_bits; //!< Number of bits in the partial byte in header
+	uint32_t dist_table[IGZIP_DIST_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code
+	uint32_t len_table[IGZIP_LEN_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code
+	uint16_t lit_table[IGZIP_LIT_TABLE_SIZE]; //!< literal code
+	uint8_t lit_table_sizes[IGZIP_LIT_TABLE_SIZE]; //!< literal code length
+	uint16_t dcodes[30 - IGZIP_DECODE_OFFSET]; //!< distance code
+	uint8_t dcodes_sizes[30 - IGZIP_DECODE_OFFSET]; //!< distance code length
+
+};
+
+/** @brief Holds stream information*/
+struct isal_zstream {
+	uint8_t *next_in;	//!< Next input byte
+	uint32_t avail_in;	//!< number of bytes available at next_in
+	uint32_t total_in;	//!< total number of bytes read so far
+
+	uint8_t *next_out;	//!< Next output byte
+	uint32_t avail_out;	//!< number of bytes available at next_out
+	uint32_t total_out;	//!< total number of bytes written so far
+
+	struct isal_hufftables *hufftables; //!< Huffman encoding used when compressing
+	uint32_t level; //!< Compression level to use, ISAL_DEF_MIN_LEVEL to ISAL_DEF_MAX_LEVEL
+	uint32_t level_buf_size; //!< Size of level_buf
+	uint8_t * level_buf; //!< User allocated buffer required for different compression levels
+	uint32_t end_of_stream;	//!< non-zero if this is the last input buffer
+	uint32_t flush;	//!< Flush type can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH
+	uint32_t gzip_flag; //!< IGZIP_DEFLATE, IGZIP_GZIP or IGZIP_GZIP_NO_HDR
+
+	struct isal_zstate internal_state;	//!< Internal state for this stream
+};
+
+/******************************************************************************/
+/* Inflate structures */
+/******************************************************************************/
+/*
+ * Inflate_huff_code data structures are used to store a Huffman code for fast
+ * lookup. It works by performing a lookup in small_code_lookup that hopefully
+ * yields the correct symbol. Otherwise a lookup into long_code_lookup is
+ * performed to find the correct symbol. The details of how this works follows:
+ *
+ * Let i be some index into small_code_lookup and let e be the associated
+ * element.  Bit 15 in e is a flag. If bit 15 is not set, then index i contains
+ * a Huffman code for a symbol which has length at most DECODE_LOOKUP_SIZE. Bits
+ * 0 through 8 are the symbol associated with that code and bits 9 through 12 of
+ * e represent the number of bits in the code. If bit 15 is set, the i
+ * corresponds to the first DECODE_LOOKUP_SIZE bits of a Huffman code which has
+ * length longer than DECODE_LOOKUP_SIZE. In this case, bits 0 through 8
+ * represent an offset into long_code_lookup table and bits 9 through 12
+ * represent the maximum length of a Huffman code starting with the bits in the
+ * index i. The offset into long_code_lookup is for an array associated with all
+ * codes which start with the bits in i.
+ *
+ * The elements of long_code_lookup are in the same format as small_code_lookup,
+ * except bit 15 is never set. Let i be a number made up of DECODE_LOOKUP_SIZE
+ * bits.  Then all Huffman codes which start with DECODE_LOOKUP_SIZE bits are
+ * stored in an array starting at index h in long_code_lookup. This index h is
+ * stored in bits 0 through 8 at index i in small_code_lookup. The index j is an
+ * index of this array if the number of bits contained in j and i is the number
+ * of bits in the longest huff_code starting with the bits of i. The symbol
+ * stored at index j is the symbol whose huffcode can be found in (j <<
+ * DECODE_LOOKUP_SIZE) | i. Note these arrays will be stored sorted in order of
+ * maximum Huffman code length.
+ *
+ * The following are explanations for sizes of the tables:
+ *
+ * Since small_code_lookup is a lookup on DECODE_LOOKUP_SIZE bits, it must have
+ * size 2^DECODE_LOOKUP_SIZE.
+ *
+ * Since deflate Huffman codes are stored such that the code size and the code
+ * value form an increasing function, at most 2^(15 - DECODE_LOOKUP_SIZE) - 1
+ * elements of long_code_lookup duplicate an existing symbol. There are at most
+ * 285 - DECODE_LOOKUP_SIZE possible symbols contained in long_code_lookup.
+ * Rounding this to the nearest 16 byte boundary yields the size of
+ * long_code_lookup of 288 + 2^(15 - DECODE_LOOKUP_SIZE).
+ *
+ * Note that DECODE_LOOKUP_SIZE can be any length even though the offset in
+ * small_code_lookup is 9 bits long because the increasing relationship between
+ * code length and code value forces the maximum offset to be less than 288.
+ */
+
+/* Large lookup table for decoding huffman codes (type of inflate_state.lit_huff_code) */
+struct inflate_huff_code_large {
+	uint16_t short_code_lookup[1 << (ISAL_DECODE_LONG_BITS)];
+	uint16_t long_code_lookup[288 + (1 << (15 - ISAL_DECODE_LONG_BITS))];
+};
+
+/* Small lookup table for decoding huffman codes (type of inflate_state.dist_huff_code) */
+struct inflate_huff_code_small {
+	uint16_t short_code_lookup[1 << (ISAL_DECODE_SHORT_BITS)];
+	uint16_t long_code_lookup[32 + (1 << (15 - ISAL_DECODE_SHORT_BITS))];
+};
+
+/** @brief Holds decompression state information*/
+struct inflate_state {
+	uint8_t *next_out;	//!< Next output Byte
+	uint32_t avail_out;	//!< Number of bytes available at next_out
+	uint32_t total_out;	//!< Total bytes written out so far
+	uint8_t *next_in;	//!< Next input byte
+	uint64_t read_in;	//!< Bits buffered to handle unaligned streams
+	uint32_t avail_in;	//!< Number of bytes available at next_in
+	int32_t read_in_length;	//!< Bits in read_in
+	struct inflate_huff_code_large lit_huff_code;	//!< Structure for decoding lit/len symbols
+	struct inflate_huff_code_small dist_huff_code;	//!< Structure for decoding dist symbols
+	enum isal_block_state block_state;	//!< Current decompression state
+	uint32_t bfinal;	//!< Flag identifying final block
+	uint32_t crc_flag;	//!< Flag identifying whether to track the crc
+	uint32_t crc;		//!< Contains crc of output if crc_flag is set
+	int32_t type0_block_len;	//!< Length left to read of type 0 block when outbuffer overflow occurred
+	int32_t copy_overflow_length; 	//!< Length left to copy when outbuffer overflow occurred
+	int32_t copy_overflow_distance;	//!< Lookback distance when outbuffer overflow occurred
+	int32_t tmp_in_size;	//!< Number of bytes in tmp_in_buffer
+	int32_t tmp_out_valid;	//!< Number of bytes in tmp_out_buffer
+	int32_t tmp_out_processed;	//!< Number of bytes processed in tmp_out_buffer
+	uint8_t tmp_in_buffer[ISAL_DEF_MAX_HDR_SIZE];	//!< Temporary buffer containing data from the input stream
+	uint8_t tmp_out_buffer[2 * ISAL_DEF_HIST_SIZE + ISAL_LOOK_AHEAD]; 	//!< Temporary buffer containing data from the output stream
+};
+
+/******************************************************************************/
+/* Compression functions */
+/******************************************************************************/
+/**
+ * @brief Updates histograms to include the symbols found in the input
+ * stream. Since this function only updates the histograms, it can be called on
+ * multiple streams to get a histogram better representing the desired data
+ * set. When first using histogram it must be initialized by zeroing the
+ * structure.
+ *
+ * @param in_stream: Input stream of data.
+ * @param length: The length of in_stream.
+ * @param histogram: The returned histogram of lit/len/dist symbols.
+ */
+void isal_update_histogram(uint8_t * in_stream, int length, struct isal_huff_histogram * histogram);
+
+
+/**
+ * @brief Builds a custom huffman code from the given histograms. Every
+ *  literal, every repeat length and every possible lookback distance is
+ *  assigned a code.
+ *
+ * @param hufftables: output structure that receives the huffman code
+ * @param histogram: frequencies of literal symbols, repeat lengths and
+ *        lookback distances
+ * @returns Returns a non zero value if an invalid huffman code was created.
+ */
+int isal_create_hufftables(struct isal_hufftables *hufftables,
+			struct isal_huff_histogram *histogram);
+
+/**
+ * @brief Builds a custom huffman code from the given histograms in the same
+ * way as isal_create_hufftables(), except that literals whose histogram
+ * frequency is 0 are not assigned a code
+ *
+ * @param hufftables: output structure that receives the huffman code
+ * @param histogram: frequencies of literal symbols, repeat lengths and
+ *        lookback distances
+ * @returns Returns a non zero value if an invalid huffman code was created.
+ */
+int isal_create_hufftables_subset(struct isal_hufftables *hufftables,
+				struct isal_huff_histogram *histogram);
+
+/**
+ * @brief Initialize compression stream data structure
+ *
+ * @param stream Structure holding state information on the compression stream.
+ * @returns none
+ */
+void isal_deflate_init(struct isal_zstream *stream);
+
+/**
+ * @brief Set stream to use a new Huffman code
+ *
+ * Sets the Huffman code to be used in compression before compression start or
+ * after the successful completion of a SYNC_FLUSH or FULL_FLUSH. If type has
+ * value IGZIP_HUFFTABLE_DEFAULT, the stream is set to use the default Huffman
+ * code. If type has value IGZIP_HUFFTABLE_STATIC, the stream is set to use the
+ * deflate standard static Huffman code, or if type has value
+ * IGZIP_HUFFTABLE_CUSTOM, the stream is set to use the isal_hufftables
+ * structure input to isal_deflate_set_hufftables.
+ *
+ * @param stream: Structure holding state information on the compression stream.
+ * @param hufftables: new huffman code to use if type is set to
+ * IGZIP_HUFFTABLE_CUSTOM.
+ * @param type: Flag specifying what hufftable to use.
+ *
+ * @returns Returns ISAL_INVALID_OPERATION if the stream was unmodified. This may
+ * be due to the stream being in a state where changing the huffman code is not
+ * allowed or an invalid input is provided.
+ */
+int isal_deflate_set_hufftables(struct isal_zstream *stream,
+				struct isal_hufftables *hufftables, int type);
+
+/**
+ * @brief Initialize compression stream data structure
+ *
+ * @param stream Structure holding state information on the compression stream.
+ * @returns none
+ */
+void isal_deflate_stateless_init(struct isal_zstream *stream);
+
+
+/**
+ * @brief Fast data (deflate) compression for storage applications.
+ *
+ * The call to isal_deflate() will take data from the input buffer (updating
+ * next_in, avail_in) and write a compressed stream to the output buffer
+ * (updating next_out and avail_out). The function returns when either the input
+ * buffer is empty or the output buffer is full.
+ *
+ * On entry to isal_deflate(), next_in points to an input buffer and avail_in
+ * indicates the length of that buffer. Similarly next_out points to an empty
+ * output buffer and avail_out indicates the size of that buffer.
+ *
+ * The fields total_in and total_out start at 0 and are updated by
+ * isal_deflate(). These reflect the total number of bytes read or written so far.
+ *
+ * When the last input buffer is passed in, signaled by setting the
+ * end_of_stream, the routine will complete compression at the end of the input
+ * buffer, as long as the output buffer is big enough.
+ *
+ * The compression level can be set by setting level to any value between
+ * ISAL_DEF_MIN_LEVEL and ISAL_DEF_MAX_LEVEL. When the compression level is
+ * ISAL_DEF_MIN_LEVEL, hufftables can be set to a table trained for the
+ * specific data type being compressed to achieve better compression. When a
+ * higher compression level is desired, a larger generic memory buffer needs to
+ * be supplied by setting level_buf and level_buf_size to represent the chunk of
+ * memory. For level x, the suggested size for this buffer is
+ * ISAL_DEF_LVLx_DEFAULT. The defines ISAL_DEF_LVLx_MIN, ISAL_DEF_LVLx_SMALL,
+ * ISAL_DEF_LVLx_MEDIUM, ISAL_DEF_LVLx_LARGE, and ISAL_DEF_LVLx_EXTRA_LARGE
+ * are also provided as other suggested sizes.
+ *
+ * The equivalent of the zlib FLUSH_SYNC operation is currently supported.
+ * Flush types can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH. Default flush type is
+ * NO_FLUSH. A SYNC_ OR FULL_ flush will byte align the deflate block by
+ * appending an empty stored block once all input has been compressed, including
+ * the buffered input. Checking that the out_buffer is not empty or that
+ * internal_state.state = ZSTATE_NEW_HDR is sufficient to guarantee all input
+ * has been flushed. Additionally FULL_FLUSH will ensure look back history does
+ * not include previous blocks so new blocks are fully independent. Switching
+ * between flush types is supported.
+ *
+ * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip
+ * trailer are written around the deflate compressed data. If gzip_flag is set
+ * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written.
+ *
+ * @param  stream Structure holding state information on the compression streams.
+ * @return COMP_OK (if everything is ok),
+ *         INVALID_FLUSH (if an invalid FLUSH is selected),
+ *         ISAL_INVALID_LEVEL (if an invalid compression level is selected).
+ */
+int isal_deflate(struct isal_zstream *stream);
+
+
+/**
+ * @brief Fast data (deflate) stateless compression for storage applications.
+ *
+ * Stateless (one shot) compression routine with a similar interface to
+ * isal_deflate() but operates on entire input buffer at one time. Parameter
+ * avail_out must be large enough to fit the entire compressed output. Max
+ * expansion is limited to the input size plus the header size of a stored/raw
+ * block.
+ *
+ * When the compression level is set to 1, unlike in isal_deflate(), level_buf
+ * may be optionally set depending on what performance is desired.
+ *
+ * For stateless the flush types NO_FLUSH and FULL_FLUSH are supported.
+ * FULL_FLUSH will byte align the output deflate block so additional blocks can
+ * be easily appended.
+ *
+ * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip
+ * trailer are written around the deflate compressed data. If gzip_flag is set
+ * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written.
+ *
+ * @param  stream Structure holding state information on the compression streams.
+ * @return COMP_OK (if everything is ok),
+ *         INVALID_FLUSH (if an invalid FLUSH is selected),
+ *         ISAL_INVALID_LEVEL (if an invalid compression level is selected),
+ *         STATELESS_OVERFLOW (if output buffer will not fit output).
+ */
+int isal_deflate_stateless(struct isal_zstream *stream);
+
+
+/******************************************************************************/
+/* Inflate functions */
+/******************************************************************************/
+/**
+ * @brief Initialize decompression state data structure
+ *
+ * @param state Structure holding state information on the decompression stream.
+ * @returns none
+ */
+void isal_inflate_init(struct inflate_state *state);
+
+/**
+ * @brief Fast data (deflate) decompression for storage applications.
+ *
+ * On entry to isal_inflate(), next_in points to an input buffer and avail_in
+ * indicates the length of that buffer. Similarly next_out points to an empty
+ * output buffer and avail_out indicates the size of that buffer.
+ *
+ * The field total_out starts at 0 and is updated by isal_inflate(). This
+ * reflects the total number of bytes written so far.
+ *
+ * The call to isal_inflate() will take data from the input buffer (updating
+ * next_in, avail_in) and write a decompressed stream to the output buffer
+ * (updating next_out and avail_out). The function returns when the input buffer
+ * is empty, the output buffer is full or invalid data is found. The current
+ * state of the decompression on exit can be read from state->block_state. If
+ * the crc_flag is set, the gzip crc of the output is stored in state->crc.
+ *
+ * @param  state Structure holding state information on the decompression stream.
+ * @return ISAL_DECOMP_OK (if everything is ok),
+ *         ISAL_END_INPUT (if all input was decompressed),
+ *         ISAL_OUT_OVERFLOW (if output buffer ran out of space),
+ *         ISAL_INVALID_BLOCK,
+ *         ISAL_INVALID_SYMBOL,
+ *         ISAL_INVALID_LOOKBACK.
+ */
+int isal_inflate(struct inflate_state *state);
+
+/**
+ * @brief Fast data (deflate) stateless decompression for storage applications.
+ *
+ * Stateless (one shot) decompression routine with a similar interface to
+ * isal_inflate() but operates on entire input buffer at one time. Parameter
+ * avail_out must be large enough to fit the entire decompressed output.
+ *
+ * @param  state Structure holding state information on the decompression stream.
+ * @return ISAL_DECOMP_OK (if everything is ok),
+ *         ISAL_END_INPUT (if all input was decompressed),
+ *         ISAL_OUT_OVERFLOW (if output buffer ran out of space),
+ *         ISAL_INVALID_BLOCK,
+ *         ISAL_INVALID_SYMBOL,
+ *         ISAL_INVALID_LOOKBACK.
+ */
+int isal_inflate_stateless(struct inflate_state *state);
+
+#ifdef __cplusplus
+}
+#endif
+#endif	/* ifndef _IGZIP_H */
diff --git a/src/isa-l/include/multibinary.asm b/src/isa-l/include/multibinary.asm
new file mode 100644
index 00000000..7fca3a14
--- /dev/null
+++ b/src/isa-l/include/multibinary.asm
@@ -0,0 +1,307 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _MULTIBINARY_ASM_
+%define _MULTIBINARY_ASM_
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ %define mbin_def_ptr	dd
+ %define mbin_ptr_sz	dword
+ %define mbin_rdi	edi
+ %define mbin_rsi	esi
+ %define mbin_rax	eax
+ %define mbin_rbx	ebx
+ %define mbin_rcx	ecx
+ %define mbin_rdx	edx
+%else
+ %define mbin_def_ptr	dq
+ %define mbin_ptr_sz	qword
+ %define mbin_rdi	rdi
+ %define mbin_rsi	rsi
+ %define mbin_rax	rax
+ %define mbin_rbx	rbx
+ %define mbin_rcx	rcx
+ %define mbin_rdx	rdx
+%endif
+
+;;;;
+; mbin_interface: multibinary entry-point macro (1 parameter: function name)
+;   creates the visible entry point that jumps through the HW optimized call pointer
+;   creates the first-call stub that initializes the HW optimized call pointer
+;;;;
+%macro mbin_interface 1
+	;;;;
+	; *_dispatched is defaulted to *_mbinit and replaced on first call.
+	; Therefore, *_dispatch_init is only executed on first call.
+	;;;;
+	section .data
+	%1_dispatched:
+		mbin_def_ptr	%1_mbinit	; pointer-sized slot, initially the init stub
+
+	section .text
+	global %1:function
+	%1_mbinit:
+		;;; only called the first time to setup hardware match
+		call	%1_dispatch_init	; overwrites the slot above with the chosen version
+		;;; falls thru to execute the hw optimized code
+	%1:
+		jmp	mbin_ptr_sz [%1_dispatched]	; tail-jump: arguments and return address untouched
+%endmacro
+
+;;;;;
+; mbin_dispatch_init parameters
+; Use this function when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init 4
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi		; rsi accumulates the selected function pointer
+		push	mbin_rax		; eax, ebx, ecx, edx are overwritten by cpuid
+		push	mbin_rbx
+		push	mbin_rcx
+		push	mbin_rdx
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+		mov	eax, 1			; CPUID leaf 1: feature flags returned in ecx
+		cpuid
+		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		lea	mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func; lea leaves the cmp flags intact
+		jne	_%1_init_done ; AVX is not available so end
+		mov	mbin_rsi, mbin_rbx
+
+		;; Try for AVX2
+		xor	ecx, ecx		; sub-leaf 0
+		mov	eax, 7
+		cpuid
+		test	ebx, FLAG_CPUID7_EBX_AVX2
+		lea	mbin_rbx, [%4 WRT_OPT] ; AVX2 (gen4) opt func
+		cmovne	mbin_rsi, mbin_rbx
+
+		;; Does it have xmm and ymm support (XCR0 read by xgetbv with ecx = 0)
+		xor	ecx, ecx
+		xgetbv
+		and	eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+		je	_%1_init_done
+		lea	mbin_rsi, [%2 WRT_OPT]	; xmm/ymm state not enabled: back to the base version
+
+	_%1_init_done:
+		pop	mbin_rdx
+		pop	mbin_rcx
+		pop	mbin_rbx
+		pop	mbin_rax
+		mov	[%1_dispatched], mbin_rsi	; store the choice in the dispatch slot
+		pop	mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init2 parameters
+;  Cases where only base functions are available (no CPUID probing is done)
+; 1-> function name
+; 2-> base function
+;;;;;
+%macro mbin_dispatch_init2 2
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi		; rsi is the only register used, so the only one saved
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default
+		mov	[%1_dispatched], mbin_rsi	; dispatch slot always ends up at the base function
+		pop	mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init_clmul 3 parameters
+; Use this case for CRC which needs both SSE4_1 and CLMUL
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 and CLMUL optimized function
+;;;;;
+%macro mbin_dispatch_init_clmul 3
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi		; rsi accumulates the selected function pointer
+		push	mbin_rax		; eax, ebx, ecx, edx are overwritten by cpuid
+		push	mbin_rbx
+		push	mbin_rcx
+		push	mbin_rdx
+		lea     mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov     eax, 1			; CPUID leaf 1: feature flags returned in ecx
+		cpuid
+		lea	mbin_rbx, [%3 WRT_OPT] ; SSE opt func
+
+		; Test for SSE4.1 first, then CLMUL; the optimized version needs both
+		test	ecx, FLAG_CPUID1_ECX_SSE4_1
+		jz	_%1_init_done
+		test    ecx, FLAG_CPUID1_ECX_CLMUL
+		cmovne	mbin_rsi, mbin_rbx	; taken only when the CLMUL bit is set
+	_%1_init_done:
+		pop	mbin_rdx
+		pop	mbin_rcx
+		pop	mbin_rbx
+		pop	mbin_rax
+		mov	[%1_dispatched], mbin_rsi	; store the choice in the dispatch slot
+		pop	mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init5 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init5 5
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi		; rsi accumulates the selected function pointer
+		push	mbin_rax		; eax, ebx, ecx, edx are overwritten by cpuid
+		push	mbin_rbx
+		push	mbin_rcx
+		push	mbin_rdx
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov	eax, 1			; CPUID leaf 1: feature flags returned in ecx
+		cpuid
+		; Test for SSE4.2
+		test	ecx, FLAG_CPUID1_ECX_SSE4_2
+		lea	mbin_rbx, [%3 WRT_OPT] ; SSE opt func; lea leaves the test flags intact
+		cmovne	mbin_rsi, mbin_rbx
+
+		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		lea	mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
+		jne	_%1_init_done ; AVX is not available so end
+		mov	mbin_rsi, mbin_rbx
+
+		;; Try for AVX2
+		xor	ecx, ecx		; sub-leaf 0
+		mov	eax, 7
+		cpuid
+		test	ebx, FLAG_CPUID7_EBX_AVX2
+		lea	mbin_rbx, [%5 WRT_OPT] ; AVX2 (gen4) opt func
+		cmovne	mbin_rsi, mbin_rbx
+
+		;; Does it have xmm and ymm support (XCR0 read by xgetbv with ecx = 0)
+		xor	ecx, ecx
+		xgetbv
+		and	eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+		je	_%1_init_done
+		lea	mbin_rsi, [%3 WRT_OPT]	; xmm/ymm state not enabled: back to the SSE version
+
+	_%1_init_done:
+		pop	mbin_rdx
+		pop	mbin_rcx
+		pop	mbin_rbx
+		pop	mbin_rax
+		mov	[%1_dispatched], mbin_rsi	; store the choice in the dispatch slot
+		pop	mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init6 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+;;;;;
+%macro mbin_dispatch_init6 6
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi		; rsi accumulates the selected function pointer
+		push	mbin_rax		; eax, ebx, ecx, edx are overwritten by cpuid
+		push	mbin_rbx
+		push	mbin_rcx
+		push	mbin_rdx
+		push	mbin_rdi		; edi is used below to keep the xgetbv result
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov	eax, 1
+		cpuid
+		mov	ebx, ecx ; save cpuid1.ecx
+		test	ecx, FLAG_CPUID1_ECX_SSE4_2
+		je	_%1_init_done	  ; Use base function if no SSE4_2
+		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+		;; Test for XMM_YMM support/AVX
+		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
+		je	_%1_init_done
+		xor	ecx, ecx
+		xgetbv	; xcr -> edx:eax
+		mov	edi, eax	  ; save xgetbv.eax
+
+		and	eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+		jne	_%1_init_done
+		test	ebx, FLAG_CPUID1_ECX_AVX	; ebx still holds the saved cpuid1.ecx
+		je	_%1_init_done
+		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+		;; Test for AVX2
+		xor	ecx, ecx
+		mov	eax, 7
+		cpuid
+		test	ebx, FLAG_CPUID7_EBX_AVX2	; ebx now holds cpuid7.ebx
+		je	_%1_init_done		; No AVX2 possible
+		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func
+
+		;; Test for AVX512
+		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
+		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
+		jne	_%1_init_done	  ; No AVX512 possible
+		and	ebx, FLAGS_CPUID7_ECX_AVX512_G1	; mask is built from cpuid7.ebx bits despite its name
+		cmp	ebx, FLAGS_CPUID7_ECX_AVX512_G1
+		lea	mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+		cmove	mbin_rsi, mbin_rbx	; every bit of the mask must be set
+
+	_%1_init_done:
+		pop	mbin_rdi
+		pop	mbin_rdx
+		pop	mbin_rcx
+		pop	mbin_rbx
+		pop	mbin_rax
+		mov	[%1_dispatched], mbin_rsi	; store the choice in the dispatch slot
+		pop	mbin_rsi
+		ret
+%endmacro
+
+%endif ; ifndef _MULTIBINARY_ASM_
diff --git a/src/isa-l/include/raid.h b/src/isa-l/include/raid.h
new file mode 100644
index 00000000..192fca28
--- /dev/null
+++ b/src/isa-l/include/raid.h
@@ -0,0 +1,302 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _RAID_H_
+#define _RAID_H_
+
+/**
+ *  @file  raid.h
+ *  @brief Interface to RAID functions - XOR and P+Q calculation.
+ *
+ *  This file defines the interface to optimized XOR calculation (RAID5) or P+Q
+ *  dual parity (RAID6).  Operations are carried out on an array of pointers to
+ *  sources and output arrays.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Multi-binary functions */
+
+/**
+ * @brief Generate XOR parity vector from N sources, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ *                the last pointer. ie array[vects-1]. Src and dest
+ *                pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects   Number of vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to vectors. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 32B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1].  P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs appropriate version.
+ *
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
+ * @param vects  Number of vectors in array including P&Q.
+ * @param len    Length of each vector in bytes. Must be 16B aligned.
+ * @param array  Array of pointers to source and P, Q. P and Q parity
+ *               are assumed to be the last two pointers in the array.
+ *               All pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_check(int vects, int len, void **array);
+
+
+/* Arch specific versions */
+
+/**
+ * @brief Generate XOR parity vector from N sources.
+ * @requires SSE4.1
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ *                the last pointer. ie array[vects-1]. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate XOR parity vector from N sources.
+ * @requires AVX
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ *                the last pointer. ie array[vects-1]. Src and dest pointers
+ *                must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_avx(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors.
+ * @requires SSE4.1
+ *
+ * @param vects   Number of vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to vectors. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires SSE4.1
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 16B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1]. P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires AVX
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 16B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1]. P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_avx(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources.
+ * @requires AVX2
+ *
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 32B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ *                is the last two pointers. ie array[vects-2],
+ *                array[vects-1]. P and Q parity vectors are
+ *                written to these last two pointers. Src and dest
+ *                pointers must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_avx2(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors.
+ * @requires SSE4.1
+ *
+ * @param vects  Number of vectors in array including P&Q.
+ * @param len    Length of each vector in bytes. Must be 16B aligned.
+ * @param array  Array of pointers to source and P, Q. P and Q parity
+ *               are assumed to be the last two pointers in the array.
+ *               All pointers must be aligned to 16B.
+ * @returns 0 pass, other fail
+ */
+
+int pq_check_sse(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate P+Q parity vectors from N sources, runs baseline version.
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes. Must be 16B aligned.
+ * @param array   Array of pointers to source and dest. For P+Q the dest
+ * 		  is the last two pointers. ie array[vects-2],
+ * 		  array[vects-1]. P and Q parity vectors are
+ * 		  written to these last two pointers. Src and dest pointers
+ * 		  must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_gen_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Generate XOR parity vector from N sources, runs baseline version.
+ * @param vects   Number of source+dest vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to source and dest. For XOR the dest is
+ * 		  the last pointer. ie array[vects-1]. Src and dest pointers
+ * 		  must be aligned to 32B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_gen_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array has XOR parity sum of 0 across all vectors, runs baseline version.
+ *
+ * @param vects   Number of vectors in array.
+ * @param len     Length of each vector in bytes.
+ * @param array   Array of pointers to vectors. Src and dest pointers
+ *                must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int xor_check_base(int vects, int len, void **array);
+
+
+/**
+ * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs baseline version.
+ *
+ * @param vects  Number of vectors in array including P&Q.
+ * @param len    Length of each vector in bytes. Must be 16B aligned.
+ * @param array  Array of pointers to source and P, Q. P and Q parity
+ *               are assumed to be the last two pointers in the array.
+ *               All pointers must be aligned to 16B.
+ *
+ * @returns 0 pass, other fail
+ */
+
+int pq_check_base(int vects, int len, void **array);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_RAID_H_
diff --git a/src/isa-l/include/reg_sizes.asm b/src/isa-l/include/reg_sizes.asm
new file mode 100644
index 00000000..cd689b7f
--- /dev/null
+++ b/src/isa-l/include/reg_sizes.asm
@@ -0,0 +1,149 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _REG_SIZES_ASM_
+%define _REG_SIZES_ASM_
+
+%ifdef __NASM_VER__
+%ifidn __OUTPUT_FORMAT__, win64
+%error nasm not supported in windows
+%else
+%define endproc_frame
+%endif ; ifidn __OUTPUT_FORMAT__, win64 (otherwise endproc_frame expands to nothing)
+%endif ; ifdef __NASM_VER__
+
+%define EFLAGS_HAS_CPUID        (1<<21)
+%define FLAG_CPUID1_ECX_CLMUL   (1<<1)
+%define FLAG_CPUID1_EDX_SSE2    (1<<26)
+%define FLAG_CPUID1_ECX_SSE3	(1)
+%define FLAG_CPUID1_ECX_SSE4_1  (1<<19)
+%define FLAG_CPUID1_ECX_SSE4_2  (1<<20)
+%define FLAG_CPUID1_ECX_POPCNT  (1<<23)
+%define FLAG_CPUID1_ECX_AESNI   (1<<25)
+%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
+%define FLAG_CPUID1_ECX_AVX     (1<<28)
+%define FLAG_CPUID1_EBX_AVX2    (1<<5)
+
+%define FLAG_CPUID7_EBX_AVX2           (1<<5)
+%define FLAG_CPUID7_EBX_AVX512F        (1<<16)
+%define FLAG_CPUID7_EBX_AVX512DQ       (1<<17)
+%define FLAG_CPUID7_EBX_AVX512IFMA     (1<<21)
+%define FLAG_CPUID7_EBX_AVX512PF       (1<<26)
+%define FLAG_CPUID7_EBX_AVX512ER       (1<<27)
+%define FLAG_CPUID7_EBX_AVX512CD       (1<<28)
+%define FLAG_CPUID7_EBX_AVX512BW       (1<<30)
+%define FLAG_CPUID7_EBX_AVX512VL       (1<<31)
+%define FLAG_CPUID7_ECX_AVX512VBMI     (1<<1)
+; NOTE: despite the ECX in its name, the mask below is composed only of CPUID.7 EBX bits
+%define FLAGS_CPUID7_ECX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ)
+
+%define FLAG_XGETBV_EAX_XMM            (1<<1)
+%define FLAG_XGETBV_EAX_YMM            (1<<2)
+%define FLAG_XGETBV_EAX_XMM_YMM        0x6
+%define FLAG_XGETBV_EAX_ZMM_OPM        0xe0
+
+%define FLAG_CPUID1_EAX_AVOTON     0x000406d0
+%define FLAG_CPUID1_EAX_STEP_MASK  0xfffffff0
+
+; define d (dword), w (word) and b (byte) variants for registers
+
+%define	raxd	eax
+%define raxw	ax
+%define raxb	al
+
+%define	rbxd	ebx
+%define rbxw	bx
+%define rbxb	bl
+
+%define	rcxd	ecx
+%define rcxw	cx
+%define rcxb	cl
+
+%define	rdxd	edx
+%define rdxw	dx
+%define rdxb	dl
+
+%define	rsid	esi
+%define rsiw	si
+%define rsib	sil
+
+%define	rdid	edi
+%define rdiw	di
+%define rdib	dil
+
+%define	rbpd	ebp
+%define rbpw	bp
+%define rbpb	bpl
+
+%define ymm0x xmm0
+%define ymm1x xmm1
+%define ymm2x xmm2
+%define ymm3x xmm3
+%define ymm4x xmm4
+%define ymm5x xmm5
+%define ymm6x xmm6
+%define ymm7x xmm7
+%define ymm8x xmm8
+%define ymm9x xmm9
+%define ymm10x xmm10
+%define ymm11x xmm11
+%define ymm12x xmm12
+%define ymm13x xmm13
+%define ymm14x xmm14
+%define ymm15x xmm15
+
+%define DWORD(reg) reg %+ d
+%define WORD(reg)  reg %+ w
+%define BYTE(reg)  reg %+ b
+
+%define XWORD(reg) reg %+ x
+
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits	; GNU toolchain marker: no executable stack needed
+section .text	; switch back so following code lands in .text
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits	; GNU toolchain marker: no executable stack needed
+section .text	; switch back so following code lands in .text
+%endif
+%ifidn __OUTPUT_FORMAT__, macho64
+%define elf64 macho64
+%endif ; macho64 -- NOTE(review): presumably lets later elf64 format checks also match macho64; confirm
+; slversion name, a, b, c: exports name_slver and name_slver_abc labels on a 4-byte version record
+%macro slversion 4
+	section .text
+	global %1_slver_%2%3%4
+	global %1_slver
+	%1_slver:
+	%1_slver_%2%3%4:
+		dw 0x%4		; 16-bit word from the fourth argument, read as hex
+		db 0x%3, 0x%2	; followed by one byte each for the third and second arguments
+%endmacro
+
+%endif ; ifndef _REG_SIZES_ASM_
diff --git a/src/isa-l/include/test.h b/src/isa-l/include/test.h
new file mode 100644
index 00000000..6f354b31
--- /dev/null
+++ b/src/isa-l/include/test.h
@@ -0,0 +1,81 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _TEST_H
+#define _TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Use sys/time.h functions for time
+#include <stdio.h>	// printf() used by perf_print()
+#include <sys/time.h>
+
+struct perf{
+	struct timeval tv;	/* timestamp written by perf_start()/perf_stop() */
+};
+
+/* Record the start time in p->tv; returns gettimeofday()'s status. */
+static inline int perf_start(struct perf *p)
+{	/* static: a plain C99 inline definition provides no symbol to link against */
+	return gettimeofday(&(p->tv), 0);
+}
+static inline int perf_stop(struct perf *p)	/* record the stop time in p->tv */
+{	/* static: a plain C99 inline definition provides no symbol to link against */
+	return gettimeofday(&(p->tv), 0);	/* gettimeofday()'s status is returned as-is */
+}
+
+/*
+ * Print the time elapsed between start and stop in microseconds and, when
+ * dsize (bytes processed) is non-zero, the size in MB and the bandwidth.
+ * static inline: a plain C99 "inline" definition emits no external symbol,
+ * so a call the compiler chooses not to inline would fail to link.
+ */
+static inline void perf_print(struct perf stop, struct perf start, long long dsize)
+{
+	long long secs = stop.tv.tv_sec - start.tv.tv_sec;
+	long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec;
+
+	printf("runtime = %10lld usecs", usecs);
+	if (dsize != 0) {
+		printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024),
+			((double) usecs)/1000000, ((double) dsize) / (double)usecs);
+	}
+	else
+		printf("\n");
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _TEST_H
diff --git a/src/isa-l/include/types.h b/src/isa-l/include/types.h
new file mode 100644
index 00000000..41d53554
--- /dev/null
+++ b/src/isa-l/include/types.h
@@ -0,0 +1,88 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions 
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ *  @file  types.h
+ *  @brief Defines standard width types.
+ *
+ */
+
+#ifndef __TYPES_H
+#define __TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+#ifdef __MINGW32__
+# include <_mingw.h>
+#endif
+typedef unsigned __int64 UINT64;
+typedef          __int64  INT64;
+typedef unsigned __int32 UINT32;
+typedef unsigned __int16 UINT16;
+typedef unsigned char    UINT8;
+#else
+# include <stdint.h>	/* long is only 32 bits on ILP32 targets, so use exact-width types */
+typedef uint64_t           UINT64;
+typedef  int64_t            INT64;
+typedef unsigned int       UINT32;
+typedef unsigned short int UINT16;
+typedef unsigned char      UINT8;
+#endif
+/* Portability shims for aligned declarations and aligned allocation (posix_memalign-style: non-zero on failure). */
+#if defined  __unix__ || defined __APPLE__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define __forceinline static inline
+# define aligned_free(x) free(x)
+#else
+# ifdef __MINGW32__
+#   define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+#   define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (char*) _aligned_malloc((len), (algn))))
+#   define aligned_free(x) _aligned_free(x)
+# else
+#   define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
+#   define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (char*) _aligned_malloc((len), (algn))))
+#   define aligned_free(x) _aligned_free(x)
+# endif
+#endif
+
+#ifdef DEBUG
+# define DEBUG_PRINT(x) printf x	/* x supplies its own parentheses: DEBUG_PRINT(("n=%d\n", n)) */
+#else
+# define DEBUG_PRINT(x) do {} while (0)	/* expands to an empty statement; x is discarded */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  //__TYPES_H