diff options
Diffstat (limited to 'src/isa-l/include')
-rw-r--r-- | src/isa-l/include/crc.h | 134 | ||||
-rw-r--r-- | src/isa-l/include/crc64.h | 277 | ||||
-rw-r--r-- | src/isa-l/include/erasure_code.h | 933 | ||||
-rw-r--r-- | src/isa-l/include/gf_vect_mul.h | 148 | ||||
-rw-r--r-- | src/isa-l/include/igzip_lib.h | 636 | ||||
-rw-r--r-- | src/isa-l/include/multibinary.asm | 307 | ||||
-rw-r--r-- | src/isa-l/include/raid.h | 302 | ||||
-rw-r--r-- | src/isa-l/include/reg_sizes.asm | 149 | ||||
-rw-r--r-- | src/isa-l/include/test.h | 81 | ||||
-rw-r--r-- | src/isa-l/include/types.h | 88 |
10 files changed, 3055 insertions, 0 deletions
diff --git a/src/isa-l/include/crc.h b/src/isa-l/include/crc.h new file mode 100644 index 00000000..5d6b0495 --- /dev/null +++ b/src/isa-l/include/crc.h @@ -0,0 +1,134 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file crc.h + * @brief CRC functions. + */ + + +#ifndef _CRC_H_ +#define _CRC_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Multi-binary functions */ + +/** + * @brief Generate CRC from the T10 standard, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @returns 16 bit CRC + */ +uint16_t crc16_t10dif( + uint16_t init_crc, //!< initial CRC value, 16 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief Generate CRC from the IEEE standard, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @returns 32 bit CRC + */ + +uint32_t crc32_ieee( + uint32_t init_crc, //!< initial CRC value, 32 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief ISCSI CRC function, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @returns 32 bit CRC + */ +unsigned int crc32_iscsi( + unsigned char *buffer, //!< buffer to calculate CRC on + int len, //!< buffer length in bytes + unsigned int init_crc //!< initial CRC value + ); + + +/* Base functions */ + +/** + * @brief ISCSI CRC function, baseline version + * @returns 32 bit CRC + */ +unsigned int crc32_iscsi_base( + unsigned char *buffer, //!< buffer to calculate CRC on + int len, //!< buffer length in bytes + unsigned int crc_init //!< initial CRC value + ); + + +/** + * @brief Generate CRC from the T10 standard, runs baseline version + * @returns 16 bit CRC + */ +uint16_t crc16_t10dif_base( + uint16_t seed, //!< initial CRC value, 16 bits + uint8_t *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief Generate CRC from the IEEE standard, runs baseline version + * @returns 32 bit CRC + */ +uint32_t crc32_ieee_base( + uint32_t seed, //!< initial CRC value, 32 bits + uint8_t *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +#ifdef __cplusplus +} +#endif + +#endif // _CRC_H_ diff --git a/src/isa-l/include/crc64.h b/src/isa-l/include/crc64.h new file mode 100644 index 00000000..8d7d81f9 --- /dev/null +++ b/src/isa-l/include/crc64.h @@ -0,0 +1,277 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file crc64.h + * @brief CRC64 functions. + */ + + +#ifndef _CRC64_H_ +#define _CRC64_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Multi-binary functions */ + +/** + * @brief Generate CRC from ECMA-182 standard in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_ecma_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_ecma_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_iso_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_iso_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_jones_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_jones_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/* Arch specific versions */ + +/** + * @brief Generate CRC from ECMA-182 standard in reflected format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_ecma_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in normal format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_ecma_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_ecma_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_ecma_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in reflected format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_iso_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_iso_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_iso_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_iso_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_jones_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_jones_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_jones_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_jones_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +#ifdef __cplusplus +} +#endif + +#endif // _CRC64_H_ diff --git a/src/isa-l/include/erasure_code.h b/src/isa-l/include/erasure_code.h new file mode 100644 index 00000000..570944c3 --- /dev/null +++ b/src/isa-l/include/erasure_code.h @@ -0,0 +1,933 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _ERASURE_CODE_H_ +#define _ERASURE_CODE_H_ + +/** + * @file erasure_code.h + * @brief Interface to functions supporting erasure code encode and decode. + * + * This file defines the interface to optimized functions used in erasure + * codes. Encode and decode of erasures in GF(2^8) are made by calculating the + * dot product of the symbols (bytes in GF(2^8)) across a set of buffers and a + * set of coefficients. Values for the coefficients are determined by the type + * of erasure code. Using a general dot product means that any sequence of + * coefficients may be used including erasure codes based on random + * coefficients. + * Multiple versions of dot product are supplied to calculate 1-6 output + * vectors in one pass. + * Base GF multiply and divide functions can be sped up by defining + * GF_LARGE_TABLES at the expense of memory size. + * + */ + +#include "gf_vect_mul.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Initialize tables for fast Erasure Code encode and decode. + * + * Generates the expanded tables needed for fast encode or decode for erasure + * codes on blocks of data. 32bytes is generated for each input coefficient. + * + * @param k The number of vector sources or rows in the generator matrix + * for coding. + * @param rows The number of output vectors to concurrently encode/decode. + * @param a Pointer to sets of arrays of input coefficients used to encode + * or decode data. + * @param gftbls Pointer to start of space for concatenated output tables + * generated from input coefficients. Must be of size 32*k*rows. + * @returns none + */ + +void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls); + +/** + * @brief Generate or decode erasure codes on blocks of data, runs appropriate version. + * + * Given a list of source data blocks, generate one or multiple blocks of + * encoded data as specified by a matrix of GF(2^8) coefficients. When given a + * suitable set of coefficients, this function will perform the fast generation + * or decoding of Reed-Solomon type erasure codes. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param len Length of each block of data (vector) of source or dest data. + * @param k The number of vector sources or rows in the generator matrix + * for coding. + * @param rows The number of output vectors to concurrently encode/decode. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*k*rows + * @param data Array of pointers to source input buffers. + * @param coding Array of pointers to coded output buffers. + * @returns none + */ + +void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data. + * + * Arch specific version of ec_encode_data() with same parameters. + * @requires SSE4.1 + */ +void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data. + * + * Arch specific version of ec_encode_data() with same parameters. + * @requires AVX + */ +void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data. + * + * Arch specific version of ec_encode_data() with same parameters. + * @requires AVX2 + */ +void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data, runs baseline version. + * + * Baseline version of ec_encode_data() with same parameters. + */ +void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, + unsigned char **dest); + +/** + * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version. + * + * Given one source data block, update one or multiple blocks of encoded data as + * specified by a matrix of GF(2^8) coefficients. When given a suitable set of + * coefficients, this function will perform the fast generation or decoding of + * Reed-Solomon type erasure codes from one input source at a time. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param len Length of each block of data (vector) of source or dest data. + * @param k The number of vector sources or rows in the generator matrix + * for coding. + * @param rows The number of output vectors to concurrently encode/decode. + * @param vec_i The vector index corresponding to the single input source. + * @param g_tbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*k*rows + * @param data Pointer to single input source used to update output parity. + * @param coding Array of pointers to coded output buffers. + * @returns none + */ +void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires SSE4.1 + */ + +void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires AVX + */ + +void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires AVX2 + */ + +void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Baseline version of ec_encode_data_update(). + */ + +void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v, + unsigned char *data, unsigned char **dest); + + +/** + * @brief GF(2^8) vector dot product. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product with two outputs. + * + * Vector dot product optimized to calculate two ouputs at a time. Does two + * GF(2^8) dot products across each byte of the input array and two constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 2*32*vlen byte constant array based on the two sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with two outputs. + * + * Vector dot product optimized to calculate two ouputs at a time. Does two + * GF(2^8) dot products across each byte of the input array and two constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 2*32*vlen byte constant array based on the two sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with two outputs. + * + * Vector dot product optimized to calculate two ouputs at a time. Does two + * GF(2^8) dot products across each byte of the input array and two constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 2*32*vlen byte constant array based on the two sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with three outputs. + * + * Vector dot product optimized to calculate three ouputs at a time. Does three + * GF(2^8) dot products across each byte of the input array and three constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 3*32*vlen byte constant array based on the three sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with three outputs. + * + * Vector dot product optimized to calculate three ouputs at a time. Does three + * GF(2^8) dot products across each byte of the input array and three constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 3*32*vlen byte constant array based on the three sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with three outputs. + * + * Vector dot product optimized to calculate three ouputs at a time. Does three + * GF(2^8) dot products across each byte of the input array and three constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 3*32*vlen byte constant array based on the three sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with four outputs. + * + * Vector dot product optimized to calculate four ouputs at a time. Does four + * GF(2^8) dot products across each byte of the input array and four constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 4*32*vlen byte constant array based on the four sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with four outputs. + * + * Vector dot product optimized to calculate four ouputs at a time. Does four + * GF(2^8) dot products across each byte of the input array and four constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 4*32*vlen byte constant array based on the four sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with four outputs. + * + * Vector dot product optimized to calculate four ouputs at a time. Does four + * GF(2^8) dot products across each byte of the input array and four constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 4*32*vlen byte constant array based on the four sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with five outputs. + * + * Vector dot product optimized to calculate five ouputs at a time. Does five + * GF(2^8) dot products across each byte of the input array and five constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 5*32*vlen byte constant array based on the five sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with five outputs. + * + * Vector dot product optimized to calculate five ouputs at a time. Does five + * GF(2^8) dot products across each byte of the input array and five constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 5*32*vlen byte constant array based on the five sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with five outputs. + * + * Vector dot product optimized to calculate five ouputs at a time. Does five + * GF(2^8) dot products across each byte of the input array and five constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 5*32*vlen byte constant array based on the five sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with six outputs. + * + * Vector dot product optimized to calculate six ouputs at a time. Does six + * GF(2^8) dot products across each byte of the input array and six constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 6*32*vlen byte constant array based on the six sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with six outputs. + * + * Vector dot product optimized to calculate six ouputs at a time. Does six + * GF(2^8) dot products across each byte of the input array and six constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 6*32*vlen byte constant array based on the six sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with six outputs. + * + * Vector dot product optimized to calculate six ouputs at a time. Does six + * GF(2^8) dot products across each byte of the input array and six constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 6*32*vlen byte constant array based on the six sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_6vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product, runs baseline version. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. Only elements 32*CONST*j + 1 + * of this array are used, where j = (0, 1, 2...) and CONST is the + * number of elements in the array of input coefficients. The + * elements used correspond to the original input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product, runs appropriate version. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + + +/** + * @brief GF(2^8) vector multiply accumulate, runs appropriate version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constant and add to destination array. Can be used for erasure coding encode + * and decode update when only one source is available at a time. Function + * requires pre-calculation of a 32*vec byte constant array based on the input + * coefficients. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires SSE4.1 + */ + +void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires AVX + */ + +void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires AVX2 + */ + +void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, baseline version. + * + * Baseline version of gf_vect_mad() with same parameters. + */ + +void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply with 2 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse(). + * @requires AVX + */ +void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse(). + * @requires AVX2 + */ +void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse(). + * @requires AVX + */ +void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse(). + * @requires AVX2 + */ +void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 4 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse(). + * @requires AVX + */ +void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse(). + * @requires AVX2 + */ +void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 5 accumulate. SSE version. + * @requires SSE4.1 + */ +void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 5 accumulate. AVX version. + * @requires AVX + */ +void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version. + * @requires AVX2 + */ +void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 6 accumulate. SSE version. + * @requires SSE4.1 + */ +void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 6 accumulate. AVX version. + * @requires AVX + */ +void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version. + * @requires AVX2 + */ +void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + + +/********************************************************************** + * The remaining are lib support functions used in GF(2^8) operations. + */ + +/** + * @brief Single element GF(2^8) multiply. + * + * @param a Multiplicand a + * @param b Multiplicand b + * @returns Product of a and b in GF(2^8) + */ + +unsigned char gf_mul(unsigned char a, unsigned char b); + +/** + * @brief Single element GF(2^8) inverse. + * + * @param a Input element + * @returns Field element b such that a x b = {1} + */ + +unsigned char gf_inv(unsigned char a); + +/** + * @brief Generate a matrix of coefficients to be used for encoding. + * + * Vandermonde matrix example of encoding coefficients where high portion of + * matrix is identity matrix I and lower portion is constructed as 2^{i*(j-k+1)} + * i:{0,k-1} j:{k,m-1}. Commonly used method for choosing coefficients in + * erasure encoding but does not guarantee invertable for every sub matrix. For + * large k it is possible to find cases where the decode matrix chosen from + * sources and parity not in erasure are not invertable. Users may want to + * adjust for k > 5. + * + * @param a [mxk] array to hold coefficients + * @param m number of rows in matrix corresponding to srcs + parity. + * @param k number of columns in matrix corresponding to srcs. + * @returns none + */ + +void gf_gen_rs_matrix(unsigned char *a, int m, int k); + +/** + * @brief Generate a Cauchy matrix of coefficients to be used for encoding. + * + * Cauchy matrix example of encoding coefficients where high portion of matrix + * is identity matrix I and lower portion is constructed as 1/(i + j) | i != j, + * i:{0,k-1} j:{k,m-1}. Any sub-matrix of a Cauchy matrix should be invertable. + * + * @param a [mxk] array to hold coefficients + * @param m number of rows in matrix corresponding to srcs + parity. + * @param k number of columns in matrix corresponding to srcs. + * @returns none + */ + +void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k); + +/** + * @brief Invert a matrix in GF(2^8) + * + * @param in input matrix + * @param out output matrix such that [in] x [out] = [I] - identity matrix + * @param n size of matrix [nxn] + * @returns 0 successful, other fail on singular input matrix + */ + +int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n); + + +/*************************************************************/ + +#ifdef __cplusplus +} +#endif + +#endif //_ERASURE_CODE_H_ diff --git a/src/isa-l/include/gf_vect_mul.h b/src/isa-l/include/gf_vect_mul.h new file mode 100644 index 00000000..42aa0a45 --- /dev/null +++ b/src/isa-l/include/gf_vect_mul.h @@ -0,0 +1,148 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _GF_VECT_MUL_H +#define _GF_VECT_MUL_H + +/** + * @file gf_vect_mul.h + * @brief Interface to functions for vector (block) multiplication in GF(2^8). + * + * This file defines the interface to routines used in fast RAID rebuild and + * erasure codes. + */ + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * @brief GF(2^8) vector multiply by constant. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len + * and src must be aligned to 32B. + * @requires SSE4.1 + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + */ + +int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest); + + + /** + * @brief GF(2^8) vector multiply by constant. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len + * and src must be aligned to 32B. + * @requires AVX + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + */ + +int gf_vect_mul_avx(int len, unsigned char *gftbl, void *src, void *dest); + + +/** + * @brief GF(2^8) vector multiply by constant, runs appropriate version. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. + * Len and src must be aligned to 32B. + * + * This function determines what instruction sets are enabled + * and selects the appropriate version at runtime. + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + */ + +int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest); + + +/** + * @brief Initialize 32-byte constant array for GF(2^8) vector multiply + * + * Calculates array {C{00}, C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, + * C{20}, ... , C{f0} } as required by other fast vector multiply + * functions. + * @param c Constant input. + * @param gftbl Table output. + */ + +void gf_vect_mul_init(unsigned char c, unsigned char* gftbl); + + +/** + * @brief GF(2^8) vector multiply by constant, runs baseline version. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len + * and src must be aligned to 32B. + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param a Pointer to 32-byte array of pre-calculated constants based on C. + * only use 2nd element is used. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + */ + +void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, + unsigned char *dest); + +#ifdef __cplusplus +} +#endif + +#endif //_GF_VECT_MUL_H diff --git a/src/isa-l/include/igzip_lib.h b/src/isa-l/include/igzip_lib.h new file mode 100644 index 00000000..3cba3faf --- /dev/null +++ b/src/isa-l/include/igzip_lib.h @@ -0,0 +1,636 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _IGZIP_H +#define _IGZIP_H + +/** + * @file igzip_lib.h + * + * @brief This file defines the igzip compression and decompression interface, a + * high performance deflate compression interface for storage applications. + * + * Deflate is a widely used compression standard that can be used standalone, it + * also forms the basis of gzip and zlib compression formats. Igzip supports the + * following flush features: + * + * - No Flush: The default method where no special flush is performed. + * + * - Sync flush: whereby isal_deflate() finishes the current deflate block at + * the end of each input buffer. The deflate block is byte aligned by + * appending an empty stored block. + * + * - Full flush: whereby isal_deflate() finishes and aligns the deflate block as + * in sync flush but also ensures that subsequent block's history does not + * look back beyond this point and new blocks are fully independent. + * + * Igzip also supports compression levels from ISAL_DEF_MIN_LEVEL to + * ISAL_DEF_MAX_LEVEL. + * + * Igzip contains some behaviour configurable at compile time. These + * configureable options are: + * + * - IGZIP_HIST_SIZE - Defines the window size. The default value is 32K (note K + * represents 1024), but 8K is also supported. Powers of 2 which are at most + * 32K may also work. + * + * - LONGER_HUFFTABLES - Defines whether to use a larger hufftables structure + * which may increase performance with smaller IGZIP_HIST_SIZE values. By + * default this optoin is not defined. This define sets IGZIP_HIST_SIZE to be + * 8 if IGZIP_HIST_SIZE > 8K. + * + * As an example, to compile gzip with an 8K window size, in a terminal run + * @verbatim gmake D="-D IGZIP_HIST_SIZE=8*1024" @endverbatim on Linux and + * FreeBSD, or with @verbatim nmake -f Makefile.nmake D="-D + * IGZIP_HIST_SIZE=8*1024" @endverbatim on Windows. + * + */ +#include <stdint.h> +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* Deflate Compression Standard Defines */ +/******************************************************************************/ +#define IGZIP_K 1024 +#define ISAL_DEF_MAX_HDR_SIZE 328 +#define ISAL_DEF_MAX_CODE_LEN 15 +#define ISAL_DEF_HIST_SIZE (32*IGZIP_K) + +#define ISAL_DEF_LIT_SYMBOLS 257 +#define ISAL_DEF_LEN_SYMBOLS 29 +#define ISAL_DEF_DIST_SYMBOLS 30 +#define ISAL_DEF_LIT_LEN_SYMBOLS (ISAL_DEF_LIT_SYMBOLS + ISAL_DEF_LEN_SYMBOLS) + +#define ISAL_LOOK_AHEAD (18 * 16) /* Max repeat length, rounded up to 32 byte boundary */ + +/******************************************************************************/ +/* Deflate Implemenation Specific Defines */ +/******************************************************************************/ +/* Note IGZIP_HIST_SIZE must be a power of two */ +#ifndef IGZIP_HIST_SIZE +#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE +#endif + +#if (IGZIP_HIST_SIZE > ISAL_DEF_HIST_SIZE) +#undef IGZIP_HIST_SIZE +#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE +#endif + +#ifdef LONGER_HUFFTABLE +#if (IGZIP_HIST_SIZE > 8 * IGZIP_K) +#undef IGZIP_HIST_SIZE +#define IGZIP_HIST_SIZE (8 * IGZIP_K) +#endif +#endif + +#define ISAL_LIMIT_HASH_UPDATE + +#ifndef IGZIP_HASH_SIZE +#define IGZIP_HASH_SIZE (8 * IGZIP_K) +#endif + +#ifdef LONGER_HUFFTABLE +enum {IGZIP_DIST_TABLE_SIZE = 8*1024}; + +/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */ +enum { IGZIP_DECODE_OFFSET = 26 }; +#else +enum {IGZIP_DIST_TABLE_SIZE = 2}; +/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */ +enum { IGZIP_DECODE_OFFSET = 0 }; +#endif +enum {IGZIP_LEN_TABLE_SIZE = 256}; +enum {IGZIP_LIT_TABLE_SIZE = ISAL_DEF_LIT_SYMBOLS}; + +#define IGZIP_HUFFTABLE_CUSTOM 0 +#define IGZIP_HUFFTABLE_DEFAULT 1 +#define IGZIP_HUFFTABLE_STATIC 2 + +/* Flush Flags */ +#define NO_FLUSH 0 /* Default */ +#define SYNC_FLUSH 1 +#define FULL_FLUSH 2 +#define FINISH_FLUSH 0 /* Deprecated */ + +/* Gzip Flags */ +#define IGZIP_DEFLATE 0 /* Default */ +#define IGZIP_GZIP 1 +#define IGZIP_GZIP_NO_HDR 2 + +/* Compression Return values */ +#define COMP_OK 0 +#define INVALID_FLUSH -7 +#define INVALID_PARAM -8 +#define STATELESS_OVERFLOW -1 +#define ISAL_INVALID_OPERATION -9 +#define ISAL_INVALID_LEVEL -4 /* Invalid Compression level set */ + +/** + * @enum isal_zstate_state + * @brief Compression State please note ZSTATE_TRL only applies for GZIP compression + */ + + +/* When the state is set to ZSTATE_NEW_HDR or TMP_ZSTATE_NEW_HEADER, the + * hufftable being used for compression may be swapped + */ +enum isal_zstate_state { + ZSTATE_NEW_HDR, //!< Header to be written + ZSTATE_HDR, //!< Header state + ZSTATE_CREATE_HDR, //!< Header to be created + ZSTATE_BODY, //!< Body state + ZSTATE_FLUSH_READ_BUFFER, //!< Flush buffer + ZSTATE_FLUSH_ICF_BUFFER, + ZSTATE_SYNC_FLUSH, //!< Write sync flush block + ZSTATE_FLUSH_WRITE_BUFFER, //!< Flush bitbuf + ZSTATE_TRL, //!< Trailer state + ZSTATE_END, //!< End state + ZSTATE_TMP_NEW_HDR, //!< Temporary Header to be written + ZSTATE_TMP_HDR, //!< Temporary Header state + ZSTATE_TMP_CREATE_HDR, //!< Temporary Header to be created state + ZSTATE_TMP_BODY, //!< Temporary Body state + ZSTATE_TMP_FLUSH_READ_BUFFER, //!< Flush buffer + ZSTATE_TMP_FLUSH_ICF_BUFFER, + ZSTATE_TMP_SYNC_FLUSH, //!< Write sync flush block + ZSTATE_TMP_FLUSH_WRITE_BUFFER, //!< Flush bitbuf + ZSTATE_TMP_TRL, //!< Temporary Trailer state + ZSTATE_TMP_END //!< Temporary End state +}; + +/* Offset used to switch between TMP states and non-tmp states */ +#define ZSTATE_TMP_OFFSET ZSTATE_TMP_HDR - ZSTATE_HDR + +/******************************************************************************/ +/* Inflate Implementation Specific Defines */ +/******************************************************************************/ +#define ISAL_DECODE_LONG_BITS 12 +#define ISAL_DECODE_SHORT_BITS 10 + +/* Current state of decompression */ +enum isal_block_state { + ISAL_BLOCK_NEW_HDR, /* Just starting a new block */ + ISAL_BLOCK_HDR, /* In the middle of reading in a block header */ + ISAL_BLOCK_TYPE0, /* Decoding a type 0 block */ + ISAL_BLOCK_CODED, /* Decoding a huffman coded block */ + ISAL_BLOCK_INPUT_DONE, /* Decompression of input is completed */ + ISAL_BLOCK_FINISH /* Decompression of input is completed and all data has been flushed to output */ +}; + +/* Inflate Return values */ +#define ISAL_DECOMP_OK 0 /* No errors encountered while decompressing */ +#define ISAL_END_INPUT 1 /* End of input reached */ +#define ISAL_OUT_OVERFLOW 2 /* End of output reached */ +#define ISAL_INVALID_BLOCK -1 /* Invalid deflate block found */ +#define ISAL_INVALID_SYMBOL -2 /* Invalid deflate symbol found */ +#define ISAL_INVALID_LOOKBACK -3 /* Invalid lookback distance found */ + +/******************************************************************************/ +/* Compression structures */ +/******************************************************************************/ +/** @brief Holds histogram of deflate symbols*/ +struct isal_huff_histogram { + uint64_t lit_len_histogram[ISAL_DEF_LIT_LEN_SYMBOLS]; //!< Histogram of Literal/Len symbols seen + uint64_t dist_histogram[ISAL_DEF_DIST_SYMBOLS]; //!< Histogram of Distance Symbols seen + uint16_t hash_table[IGZIP_HASH_SIZE]; //!< Tmp space used as a hash table +}; + +struct isal_mod_hist { + uint32_t d_hist[30]; + uint32_t ll_hist[513]; +}; + +#define ISAL_DEF_MIN_LEVEL 0 +#define ISAL_DEF_MAX_LEVEL 1 + +/* Defines used set level data sizes */ +#define ISAL_DEF_LVL0_REQ 0 +#define ISAL_DEF_LVL1_REQ 4 * IGZIP_K /* has to be at least sizeof(struct level_2_buf) */ +#define ISAL_DEF_LVL1_TOKEN_SIZE 4 + +/* Data sizes for level specific data options */ +#define ISAL_DEF_LVL0_MIN ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_SMALL ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_MEDIUM ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_LARGE ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_EXTRA_LARGE ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_DEFAULT ISAL_DEF_LVL0_REQ + +#define ISAL_DEF_LVL1_MIN (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 1 * IGZIP_K) +#define ISAL_DEF_LVL1_SMALL (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 16 * IGZIP_K) +#define ISAL_DEF_LVL1_MEDIUM (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 32 * IGZIP_K) +#define ISAL_DEF_LVL1_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 64 * IGZIP_K) +#define ISAL_DEF_LVL1_EXTRA_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 128 * IGZIP_K) +#define ISAL_DEF_LVL1_DEFAULT ISAL_DEF_LVL1_LARGE + +/** @brief Holds Bit Buffer information*/ +struct BitBuf2 { + uint64_t m_bits; //!< bits in the bit buffer + uint32_t m_bit_count; //!< number of valid bits in the bit buffer + uint8_t *m_out_buf; //!< current index of buffer to write to + uint8_t *m_out_end; //!< end of buffer to write to + uint8_t *m_out_start; //!< start of buffer to write to +}; + +/* Variable prefixes: + * b_ : Measured wrt the start of the buffer + * f_ : Measured wrt the start of the file (aka file_start) + */ + +/** @brief Holds the internal state information for input and output compression streams*/ +struct isal_zstate { + uint32_t b_bytes_valid; //!< number of bytes of valid data in buffer + uint32_t b_bytes_processed; //!< keeps track of the number of bytes processed in isal_zstate.buffer + uint8_t *file_start; //!< pointer to where file would logically start + uint32_t crc; //!< Current crc + struct BitBuf2 bitbuf; //!< Bit Buffer + enum isal_zstate_state state; //!< Current state in processing the data stream + uint32_t count; //!< used for partial header/trailer writes + uint8_t tmp_out_buff[16]; //!< temporary array + uint32_t tmp_out_start; //!< temporary variable + uint32_t tmp_out_end; //!< temporary variable + uint32_t has_eob; //!< keeps track of eob on the last deflate block + uint32_t has_eob_hdr; //!< keeps track of eob hdr (with BFINAL set) + uint32_t has_hist; //!< flag to track if there is match history + + struct isal_mod_hist hist; + + DECLARE_ALIGNED(uint8_t buffer[2 * IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD], 32); //!< Internal buffer + DECLARE_ALIGNED(uint16_t head[IGZIP_HASH_SIZE], 16); //!< Hash array + +}; + +/** @brief Holds the huffman tree used to huffman encode the input stream **/ +struct isal_hufftables { + + uint8_t deflate_hdr[ISAL_DEF_MAX_HDR_SIZE]; //!< deflate huffman tree header + uint32_t deflate_hdr_count; //!< Number of whole bytes in deflate_huff_hdr + uint32_t deflate_hdr_extra_bits; //!< Number of bits in the partial byte in header + uint32_t dist_table[IGZIP_DIST_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code + uint32_t len_table[IGZIP_LEN_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code + uint16_t lit_table[IGZIP_LIT_TABLE_SIZE]; //!< literal code + uint8_t lit_table_sizes[IGZIP_LIT_TABLE_SIZE]; //!< literal code length + uint16_t dcodes[30 - IGZIP_DECODE_OFFSET]; //!< distance code + uint8_t dcodes_sizes[30 - IGZIP_DECODE_OFFSET]; //!< distance code length + +}; + +/** @brief Holds stream information*/ +struct isal_zstream { + uint8_t *next_in; //!< Next input byte + uint32_t avail_in; //!< number of bytes available at next_in + uint32_t total_in; //!< total number of bytes read so far + + uint8_t *next_out; //!< Next output byte + uint32_t avail_out; //!< number of bytes available at next_out + uint32_t total_out; //!< total number of bytes written so far + + struct isal_hufftables *hufftables; //!< Huffman encoding used when compressing + uint32_t level; //!< Compression level to use + uint32_t level_buf_size; //!< Size of level_buf + uint8_t * level_buf; //!< User allocated buffer required for different compression levels + uint32_t end_of_stream; //!< non-zero if this is the last input buffer + uint32_t flush; //!< Flush type can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH + uint32_t gzip_flag; //!< Indicate if gzip compression is to be performed + + struct isal_zstate internal_state; //!< Internal state for this stream +}; + +/******************************************************************************/ +/* Inflate structures */ +/******************************************************************************/ +/* + * Inflate_huff_code data structures are used to store a Huffman code for fast + * lookup. It works by performing a lookup in small_code_lookup that hopefully + * yields the correct symbol. Otherwise a lookup into long_code_lookup is + * performed to find the correct symbol. The details of how this works follows: + * + * Let i be some index into small_code_lookup and let e be the associated + * element. Bit 15 in e is a flag. If bit 15 is not set, then index i contains + * a Huffman code for a symbol which has length at most DECODE_LOOKUP_SIZE. Bits + * 0 through 8 are the symbol associated with that code and bits 9 through 12 of + * e represent the number of bits in the code. If bit 15 is set, the i + * corresponds to the first DECODE_LOOKUP_SIZE bits of a Huffman code which has + * length longer than DECODE_LOOKUP_SIZE. In this case, bits 0 through 8 + * represent an offset into long_code_lookup table and bits 9 through 12 + * represent the maximum length of a Huffman code starting with the bits in the + * index i. The offset into long_code_lookup is for an array associated with all + * codes which start with the bits in i. + * + * The elements of long_code_lookup are in the same format as small_code_lookup, + * except bit 15 is never set. Let i be a number made up of DECODE_LOOKUP_SIZE + * bits. Then all Huffman codes which start with DECODE_LOOKUP_SIZE bits are + * stored in an array starting at index h in long_code_lookup. This index h is + * stored in bits 0 through 9 at index i in small_code_lookup. The index j is an + * index of this array if the number of bits contained in j and i is the number + * of bits in the longest huff_code starting with the bits of i. The symbol + * stored at index j is the symbol whose huffcode can be found in (j << + * DECODE_LOOKUP_SIZE) | i. Note these arrays will be stored sorted in order of + * maximum Huffman code length. + * + * The following are explanations for sizes of the tables: + * + * Since small_code_lookup is a lookup on DECODE_LOOKUP_SIZE bits, it must have + * size 2^DECODE_LOOKUP_SIZE. + * + * Since deflate Huffman are stored such that the code size and the code value + * form an increasing function, At most 2^(15 - DECODE_LOOKUP_SIZE) - 1 elements + * of long_code_lookup duplicate an existing symbol. Since there are at most 285 + * - DECODE_LOOKUP_SIZE possible symbols contained in long_code lookup. Rounding + * this to the nearest 16 byte boundary yields the size of long_code_lookup of + * 288 + 2^(15 - DECODE_LOOKUP_SIZE). + * + * Note that DECODE_LOOKUP_SIZE can be any length even though the offset in + * small_lookup_code is 9 bits long because the increasing relationship between + * code length and code value forces the maximum offset to be less than 288. + */ + +/* Large lookup table for decoding huffman codes */ +struct inflate_huff_code_large { + uint16_t short_code_lookup[1 << (ISAL_DECODE_LONG_BITS)]; + uint16_t long_code_lookup[288 + (1 << (15 - ISAL_DECODE_LONG_BITS))]; +}; + +/* Small lookup table for decoding huffman codes */ +struct inflate_huff_code_small { + uint16_t short_code_lookup[1 << (ISAL_DECODE_SHORT_BITS)]; + uint16_t long_code_lookup[32 + (1 << (15 - ISAL_DECODE_SHORT_BITS))]; +}; + +/** @brief Holds decompression state information*/ +struct inflate_state { + uint8_t *next_out; //!< Next output Byte + uint32_t avail_out; //!< Number of bytes available at next_out + uint32_t total_out; //!< Total bytes written out so far + uint8_t *next_in; //!< Next input byte + uint64_t read_in; //!< Bits buffered to handle unaligned streams + uint32_t avail_in; //!< Number of bytes available at next_in + int32_t read_in_length; //!< Bits in read_in + struct inflate_huff_code_large lit_huff_code; //!< Structure for decoding lit/len symbols + struct inflate_huff_code_small dist_huff_code; //!< Structure for decoding dist symbols + enum isal_block_state block_state; //!< Current decompression state + uint32_t bfinal; //!< Flag identifying final block + uint32_t crc_flag; //!< Flag identifying whether to track of crc + uint32_t crc; //!< Contains crc of output if crc_flag is set + int32_t type0_block_len; //!< Length left to read of type 0 block when outbuffer overflow occured + int32_t copy_overflow_length; //!< Length left to copy when outbuffer overflow occured + int32_t copy_overflow_distance; //!< Lookback distance when outbuffer overlow occured + int32_t tmp_in_size; //!< Number of bytes in tmp_in_buffer + int32_t tmp_out_valid; //!< Number of bytes in tmp_out_buffer + int32_t tmp_out_processed; //!< Number of bytes processed in tmp_out_buffer + uint8_t tmp_in_buffer[ISAL_DEF_MAX_HDR_SIZE]; //!< Temporary buffer containing data from the input stream + uint8_t tmp_out_buffer[2 * ISAL_DEF_HIST_SIZE + ISAL_LOOK_AHEAD]; //!< Temporary buffer containing data from the output stream +}; + +/******************************************************************************/ +/* Compression functions */ +/******************************************************************************/ +/** + * @brief Updates histograms to include the symbols found in the input + * stream. Since this function only updates the histograms, it can be called on + * multiple streams to get a histogram better representing the desired data + * set. When first using histogram it must be initialized by zeroing the + * structure. + * + * @param in_stream: Input stream of data. + * @param length: The length of start_stream. + * @param histogram: The returned histogram of lit/len/dist symbols. + */ +void isal_update_histogram(uint8_t * in_stream, int length, struct isal_huff_histogram * histogram); + + +/** + * @brief Creates a custom huffman code for the given histograms in which + * every literal and repeat length is assigned a code and all possible lookback + * distances are assigned a code. + * + * @param hufftables: the output structure containing the huffman code + * @param histogram: histogram containing frequency of literal symbols, + * repeat lengths and lookback distances + * @returns Returns a non zero value if an invalid huffman code was created. + */ +int isal_create_hufftables(struct isal_hufftables * hufftables, + struct isal_huff_histogram * histogram); + +/** + * @brief Creates a custom huffman code for the given histograms like + * isal_create_hufftables() except literals with 0 frequency in the histogram + * are not assigned a code + * + * @param hufftables: the output structure containing the huffman code + * @param histogram: histogram containing frequency of literal symbols, + * repeat lengths and lookback distances + * @returns Returns a non zero value if an invalid huffman code was created. + */ +int isal_create_hufftables_subset(struct isal_hufftables * hufftables, + struct isal_huff_histogram * histogram); + +/** + * @brief Initialize compression stream data structure + * + * @param stream Structure holding state information on the compression streams. + * @returns none + */ +void isal_deflate_init(struct isal_zstream *stream); + +/** + * @brief Set stream to use a new Huffman code + * + * Sets the Huffman code to be used in compression before compression start or + * after the sucessful completion of a SYNC_FLUSH or FULL_FLUSH. If type has + * value IGZIP_HUFFTABLE_DEFAULT, the stream is set to use the default Huffman + * code. If type has value IGZIP_HUFFTABLE_STATIC, the stream is set to use the + * deflate standard static Huffman code, or if type has value + * IGZIP_HUFFTABLE_CUSTOM, the stream is set to sue the isal_hufftables + * structure input to isal_deflate_set_hufftables. + * + * @param stream: Structure holding state information on the compression stream. + * @param hufftables: new huffman code to use if type is set to + * IGZIP_HUFFTABLE_CUSTOM. + * @param type: Flag specifying what hufftable to use. + * + * @returns Returns INVALID_OPERATION if the stream was unmodified. This may be + * due to the stream being in a state where changing the huffman code is not + * allowed or an invalid input is provided. + */ +int isal_deflate_set_hufftables(struct isal_zstream *stream, + struct isal_hufftables *hufftables, int type); + +/** + * @brief Initialize compression stream data structure + * + * @param stream Structure holding state information on the compression streams. + * @returns none + */ +void isal_deflate_stateless_init(struct isal_zstream *stream); + + +/** + * @brief Fast data (deflate) compression for storage applications. + * + * The call to isal_deflate() will take data from the input buffer (updating + * next_in, avail_in and write a compressed stream to the output buffer + * (updating next_out and avail_out). The function returns when either the input + * buffer is empty or the output buffer is full. + * + * On entry to isal_deflate(), next_in points to an input buffer and avail_in + * indicates the length of that buffer. Similarly next_out points to an empty + * output buffer and avail_out indicates the size of that buffer. + * + * The fields total_in and total_out start at 0 and are updated by + * isal_deflate(). These reflect the total number of bytes read or written so far. + * + * When the last input buffer is passed in, signaled by setting the + * end_of_stream, the routine will complete compression at the end of the input + * buffer, as long as the output buffer is big enough. + * + * The compression level can be set by setting level to any value between + * ISAL_DEF_MIN_LEVEL and ISAL_DEF_MAX_LEVEL. When the compression level is + * ISAL_DEF_MIN_LEVEL, hufftables can be set to a table trained for the the + * specific data type being compressed to achieve better compression. When a + * higher compression level is desired, a larger generic memory buffer needs to + * be supplied by setting level_buf and level_buf_size to represent the chunk of + * memory. For level x, the suggest size for this buffer this buffer is + * ISAL_DEFL_LVLx_DEFAULT. The defines ISAL_DEFL_LVLx_MIN, ISAL_DEFL_LVLx_SMALL, + * ISAL_DEFL_LVLx_MEDIUM, ISAL_DEFL_LVLx_LARGE, and ISAL_DEFL_LVLx_EXTRA_LARGE + * are also provided as other suggested sizes. + * + * The equivalent of the zlib FLUSH_SYNC operation is currently supported. + * Flush types can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH. Default flush type is + * NO_FLUSH. A SYNC_ OR FULL_ flush will byte align the deflate block by + * appending an empty stored block once all input has been compressed, including + * the buffered input. Checking that the out_buffer is not empty or that + * internal_state.state = ZSTATE_NEW_HDR is sufficient to guarantee all input + * has been flushed. Additionally FULL_FLUSH will ensure look back history does + * not include previous blocks so new blocks are fully independent. Switching + * between flush types is supported. + * + * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip + * trailer are written around the deflate compressed data. If gzip_flag is set + * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written. + * + * @param stream Structure holding state information on the compression streams. + * @return COMP_OK (if everything is ok), + * INVALID_FLUSH (if an invalid FLUSH is selected), + * ISAL_INVALID_LEVEL (if an invalid compression level is selected). + */ +int isal_deflate(struct isal_zstream *stream); + + +/** + * @brief Fast data (deflate) stateless compression for storage applications. + * + * Stateless (one shot) compression routine with a similar interface to + * isal_deflate() but operates on entire input buffer at one time. Parameter + * avail_out must be large enough to fit the entire compressed output. Max + * expansion is limited to the input size plus the header size of a stored/raw + * block. + * + * When the compression level is set to 1, unlike in isal_deflate(), level_buf + * may be optionally set depending on what what permormance is desired. + * + * For stateless the flush types NO_FLUSH and FULL_FLUSH are supported. + * FULL_FLUSH will byte align the output deflate block so additional blocks can + * be easily appended. + * + * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip + * trailer are written around the deflate compressed data. If gzip_flag is set + * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written. + * + * @param stream Structure holding state information on the compression streams. + * @return COMP_OK (if everything is ok), + * INVALID_FLUSH (if an invalid FLUSH is selected), + * ISAL_INVALID_LEVEL (if an invalid compression level is selected), + * STATELESS_OVERFLOW (if output buffer will not fit output). + */ +int isal_deflate_stateless(struct isal_zstream *stream); + + +/******************************************************************************/ +/* Inflate functions */ +/******************************************************************************/ +/** + * @brief Initialize decompression state data structure + * + * @param state Structure holding state information on the compression streams. + * @returns none + */ +void isal_inflate_init(struct inflate_state *state); + +/** + * @brief Fast data (deflate) decompression for storage applications. + * + * On entry to isal_inflate(), next_in points to an input buffer and avail_in + * indicates the length of that buffer. Similarly next_out points to an empty + * output buffer and avail_out indicates the size of that buffer. + * + * The field total_out starts at 0 and is updated by isal_inflate(). This + * reflects the total number of bytes written so far. + * + * The call to isal_inflate() will take data from the input buffer (updating + * next_in, avail_in and write a decompressed stream to the output buffer + * (updating next_out and avail_out). The function returns when the input buffer + * is empty, the output buffer is full or invalid data is found. The current + * state of the decompression on exit can be read from state->block-state. If + * the crc_flag is set, the gzip crc of the output is stored in state->crc. + * + * @param state Structure holding state information on the compression streams. + * @return ISAL_DECOMP_OK (if everything is ok), + * ISAL_END_INPUT (if all input was decompressed), + * ISAL_OUT_OVERFLOW (if output buffer ran out of space), + * ISAL_INVALID_BLOCK, + * ISAL_INVALID_SYMBOL, + * ISAL_INVALID_LOOKBACK. + */ +int isal_inflate(struct inflate_state *state); + +/** + * @brief Fast data (deflate) stateless decompression for storage applications. + * + * Stateless (one shot) decompression routine with a similar interface to + * isal_inflate() but operates on entire input buffer at one time. Parameter + * avail_out must be large enough to fit the entire decompressed output. + * + * @param state Structure holding state information on the compression streams. + * @return ISAL_DECOMP_OK (if everything is ok), + * ISAL_END_INPUT (if all input was decompressed), + * ISAL_OUT_OVERFLOW (if output buffer ran out of space), + * ISAL_INVALID_BLOCK, + * ISAL_INVALID_SYMBOL, + * ISAL_INVALID_LOOKBACK. + */ +int isal_inflate_stateless(struct inflate_state *state); + +#ifdef __cplusplus +} +#endif +#endif /* ifndef _IGZIP_H */ diff --git a/src/isa-l/include/multibinary.asm b/src/isa-l/include/multibinary.asm new file mode 100644 index 00000000..7fca3a14 --- /dev/null +++ b/src/isa-l/include/multibinary.asm @@ -0,0 +1,307 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _MULTIBINARY_ASM_ +%define _MULTIBINARY_ASM_ + +%ifidn __OUTPUT_FORMAT__, elf32 + %define mbin_def_ptr dd + %define mbin_ptr_sz dword + %define mbin_rdi edi + %define mbin_rsi esi + %define mbin_rax eax + %define mbin_rbx ebx + %define mbin_rcx ecx + %define mbin_rdx edx +%else + %define mbin_def_ptr dq + %define mbin_ptr_sz qword + %define mbin_rdi rdi + %define mbin_rsi rsi + %define mbin_rax rax + %define mbin_rbx rbx + %define mbin_rcx rcx + %define mbin_rdx rdx +%endif + +;;;; +; multibinary macro: +; creates the visable entry point that uses HW optimized call pointer +; creates the init of the HW optimized call pointer +;;;; +%macro mbin_interface 1 + ;;;; + ; *_dispatched is defaulted to *_mbinit and replaced on first call. + ; Therefore, *_dispatch_init is only executed on first call. + ;;;; + section .data + %1_dispatched: + mbin_def_ptr %1_mbinit + + section .text + global %1:function + %1_mbinit: + ;;; only called the first time to setup hardware match + call %1_dispatch_init + ;;; falls thru to execute the hw optimized code + %1: + jmp mbin_ptr_sz [%1_dispatched] +%endmacro + +;;;;; +; mbin_dispatch_init parameters +; Use this function when SSE/00/01 is a minimum requirement +; 1-> function name +; 2-> SSE/00/01 optimized function used as base +; 3-> AVX or AVX/02 opt func +; 4-> AVX2 or AVX/04 opt func +;;;;; +%macro mbin_dispatch_init 4 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01 + + mov eax, 1 + cpuid + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func + jne _%1_init_done ; AVX is not available so end + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%2 WRT_OPT] + + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init2 parameters +; Cases where only base functions are available +; 1-> function name +; 2-> base function +;;;;; +%macro mbin_dispatch_init2 2 + section .text + %1_dispatch_init: + push mbin_rsi + lea mbin_rsi, [%2 WRT_OPT] ; Default + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init_clmul 3 parameters +; Use this case for CRC which needs both SSE4_1 and CLMUL +; 1-> function name +; 2-> base function +; 3-> SSE4_1 and CLMUL optimized function +;;;;; +%macro mbin_dispatch_init_clmul 3 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func + + ; Test for SSE4.2 + test ecx, FLAG_CPUID1_ECX_SSE4_1 + jz _%1_init_done + test ecx, FLAG_CPUID1_ECX_CLMUL + cmovne mbin_rsi, mbin_rbx + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init5 parameters +; 1-> function name +; 2-> base function +; 3-> SSE4_2 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +;;;;; +%macro mbin_dispatch_init5 5 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + ; Test for SSE4.2 + test ecx, FLAG_CPUID1_ECX_SSE4_2 + lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func + cmovne mbin_rsi, mbin_rbx + + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func + jne _%1_init_done ; AVX is not available so end + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%3 WRT_OPT] + + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init6 parameters +; 1-> function name +; 2-> base function +; 3-> SSE4_2 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +;;;;; +%macro mbin_dispatch_init6 6 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_2 + je _%1_init_done ; Use base function if no SSE4_2 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_ECX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_ECX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +%endif ; ifndef _MULTIBINARY_ASM_ diff --git a/src/isa-l/include/raid.h b/src/isa-l/include/raid.h new file mode 100644 index 00000000..192fca28 --- /dev/null +++ b/src/isa-l/include/raid.h @@ -0,0 +1,302 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _RAID_H_ +#define _RAID_H_ + +/** + * @file raid.h + * @brief Interface to RAID functions - XOR and P+Q calculation. + * + * This file defines the interface to optimized XOR calculation (RAID5) or P+Q + * dual parity (RAID6). Operations are carried out on an array of pointers to + * sources and output arrays. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Multi-binary functions */ + +/** + * @brief Generate XOR parity vector from N sources, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_check(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 32B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int pq_gen(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + * are assumed to be the last two pointers in the array. + * All pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_check(int vects, int len, void **array); + + +/* Arch specific versions */ + +/** + * @brief Generate XOR parity vector from N sources. + * @requires SSE4.1 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_sse(int vects, int len, void **array); + + +/** + * @brief Generate XOR parity vector from N sources. + * @requires AVX + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_avx(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors. + * @requires SSE4.1 + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_check_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires SSE4.1 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires AVX + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_avx(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires AVX2 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 32B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_avx2(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors. + * @requires SSE4.1 + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + are assumed to be the last two pointers in the array. + All pointers must be aligned to 16B. + * @returns 0 pass, other fail + */ + +int pq_check_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources, runs baseline version. + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_base(int vects, int len, void **array); + + +/** + * @brief Generate XOR parity vector from N sources, runs baseline version. + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_base(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors, runs baseline version. + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_check_base(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs baseline version. + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + * are assumed to be the last two pointers in the array. + * All pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_check_base(int vects, int len, void **array); + +#ifdef __cplusplus +} +#endif + +#endif //_RAID_H_ diff --git a/src/isa-l/include/reg_sizes.asm b/src/isa-l/include/reg_sizes.asm new file mode 100644 index 00000000..cd689b7f --- /dev/null +++ b/src/isa-l/include/reg_sizes.asm @@ -0,0 +1,149 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _REG_SIZES_ASM_ +%define _REG_SIZES_ASM_ + +%ifdef __NASM_VER__ +%ifidn __OUTPUT_FORMAT__, win64 +%error nasm not supported in windows +%else +%define endproc_frame +%endif +%endif + +%define EFLAGS_HAS_CPUID (1<<21) +%define FLAG_CPUID1_ECX_CLMUL (1<<1) +%define FLAG_CPUID1_EDX_SSE2 (1<<26) +%define FLAG_CPUID1_ECX_SSE3 (1) +%define FLAG_CPUID1_ECX_SSE4_1 (1<<19) +%define FLAG_CPUID1_ECX_SSE4_2 (1<<20) +%define FLAG_CPUID1_ECX_POPCNT (1<<23) +%define FLAG_CPUID1_ECX_AESNI (1<<25) +%define FLAG_CPUID1_ECX_OSXSAVE (1<<27) +%define FLAG_CPUID1_ECX_AVX (1<<28) +%define FLAG_CPUID1_EBX_AVX2 (1<<5) + +%define FLAG_CPUID7_EBX_AVX2 (1<<5) +%define FLAG_CPUID7_EBX_AVX512F (1<<16) +%define FLAG_CPUID7_EBX_AVX512DQ (1<<17) +%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21) +%define FLAG_CPUID7_EBX_AVX512PF (1<<26) +%define FLAG_CPUID7_EBX_AVX512ER (1<<27) +%define FLAG_CPUID7_EBX_AVX512CD (1<<28) +%define FLAG_CPUID7_EBX_AVX512BW (1<<30) +%define FLAG_CPUID7_EBX_AVX512VL (1<<31) +%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1) + +%define FLAGS_CPUID7_ECX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ) + +%define FLAG_XGETBV_EAX_XMM (1<<1) +%define FLAG_XGETBV_EAX_YMM (1<<2) +%define FLAG_XGETBV_EAX_XMM_YMM 0x6 +%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0 + +%define FLAG_CPUID1_EAX_AVOTON 0x000406d0 +%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0 + +; define d and w variants for registers + +%define raxd eax +%define raxw ax +%define raxb al + +%define rbxd ebx +%define rbxw bx +%define rbxb bl + +%define rcxd ecx +%define rcxw cx +%define rcxb cl + +%define rdxd edx +%define rdxw dx +%define rdxb dl + +%define rsid esi +%define rsiw si +%define rsib sil + +%define rdid edi +%define rdiw di +%define rdib dil + +%define rbpd ebp +%define rbpw bp +%define rbpb bpl + +%define ymm0x xmm0 +%define ymm1x xmm1 +%define ymm2x xmm2 +%define ymm3x xmm3 +%define ymm4x xmm4 +%define ymm5x xmm5 +%define ymm6x xmm6 +%define ymm7x xmm7 +%define ymm8x xmm8 +%define ymm9x xmm9 +%define ymm10x xmm10 +%define ymm11x xmm11 +%define ymm12x xmm12 +%define ymm13x xmm13 +%define ymm14x xmm14 +%define ymm15x xmm15 + +%define DWORD(reg) reg %+ d +%define WORD(reg) reg %+ w +%define BYTE(reg) reg %+ b + +%define XWORD(reg) reg %+ x + +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__, macho64 +%define elf64 macho64 +%endif + +%macro slversion 4 + section .text + global %1_slver_%2%3%4 + global %1_slver + %1_slver: + %1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro + +%endif ; ifndef _REG_SIZES_ASM_ diff --git a/src/isa-l/include/test.h b/src/isa-l/include/test.h new file mode 100644 index 00000000..6f354b31 --- /dev/null +++ b/src/isa-l/include/test.h @@ -0,0 +1,81 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _TEST_H +#define _TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Use sys/time.h functions for time + +#include <sys/time.h> + +struct perf{ + struct timeval tv; +}; + + +inline int perf_start(struct perf *p) +{ + return gettimeofday(&(p->tv), 0); +} +inline int perf_stop(struct perf *p) +{ + return gettimeofday(&(p->tv), 0); +} + +inline void perf_print(struct perf stop, struct perf start, long long dsize) +{ + long long secs = stop.tv.tv_sec - start.tv.tv_sec; + long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec; + + printf("runtime = %10lld usecs", usecs); + if (dsize != 0) { +#if 1 // not bug in printf for 32-bit + printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024), + ((double) usecs)/1000000, ((double) dsize) / (double)usecs); +#else + printf(", bandwidth %lld MB ", dsize/(1024*1024)); + printf("in %.4f sec ",(double)usecs/1000000); + printf("= %.2f MB/s\n", (double)dsize/usecs); +#endif + } + else + printf("\n"); +} + + +#ifdef __cplusplus +} +#endif + +#endif // _TEST_H diff --git a/src/isa-l/include/types.h b/src/isa-l/include/types.h new file mode 100644 index 00000000..41d53554 --- /dev/null +++ b/src/isa-l/include/types.h @@ -0,0 +1,88 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file types.h + * @brief Defines standard width types. + * + */ + +#ifndef __TYPES_H +#define __TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +#ifdef __MINGW32__ +# include <_mingw.h> +#endif +typedef unsigned __int64 UINT64; +typedef __int64 INT64; +typedef unsigned __int32 UINT32; +typedef unsigned __int16 UINT16; +typedef unsigned char UINT8; +#else +typedef unsigned long int UINT64; +typedef long int INT64; +typedef unsigned int UINT32; +typedef unsigned short int UINT16; +typedef unsigned char UINT8; +#endif + + +#if defined __unix__ || defined __APPLE__ +# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval))) +# define __forceinline static inline +# define aligned_free(x) free(x) +#else +# ifdef __MINGW32__ +# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval))) +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# else +# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# endif +#endif + +#ifdef DEBUG +# define DEBUG_PRINT(x) printf x +#else +# define DEBUG_PRINT(x) do {} while (0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif //__TYPES_H |