diff options
Diffstat (limited to 'src/isa-l/include')
-rw-r--r-- | src/isa-l/include/aarch64_multibinary.h | 311 | ||||
-rw-r--r-- | src/isa-l/include/crc.h | 212 | ||||
-rw-r--r-- | src/isa-l/include/crc64.h | 277 | ||||
-rw-r--r-- | src/isa-l/include/erasure_code.h | 947 | ||||
-rw-r--r-- | src/isa-l/include/gf_vect_mul.h | 152 | ||||
-rw-r--r-- | src/isa-l/include/igzip_lib.h | 990 | ||||
-rw-r--r-- | src/isa-l/include/mem_routines.h | 64 | ||||
-rw-r--r-- | src/isa-l/include/multibinary.asm | 440 | ||||
-rw-r--r-- | src/isa-l/include/raid.h | 305 | ||||
-rw-r--r-- | src/isa-l/include/reg_sizes.asm | 291 | ||||
-rw-r--r-- | src/isa-l/include/test.h | 285 | ||||
-rw-r--r-- | src/isa-l/include/types.h | 77 | ||||
-rw-r--r-- | src/isa-l/include/unaligned.h | 76 |
13 files changed, 4427 insertions, 0 deletions
diff --git a/src/isa-l/include/aarch64_multibinary.h b/src/isa-l/include/aarch64_multibinary.h new file mode 100644 index 000000000..e31451be6 --- /dev/null +++ b/src/isa-l/include/aarch64_multibinary.h @@ -0,0 +1,311 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#ifndef __AARCH64_MULTIBINARY_H__ +#define __AARCH64_MULTIBINARY_H__ +#ifndef __aarch64__ +#error "This file is for aarch64 only" +#endif +#include <asm/hwcap.h> +#ifdef __ASSEMBLY__ +/** + * # mbin_interface : the wrapper layer for isal-l api + * + * ## references: + * * https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=sysdeps/aarch64/dl-trampoline.S + * * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf + * * https://static.docs.arm.com/ihi0057/b/IHI0057B_aadwarf64.pdf?_ga=2.80574487.1870739014.1564969896-1634778941.1548729310 + * + * ## Usage: + * 1. Define dispather function + * 2. name must be \name\()_dispatcher + * 3. Prototype should be *"void * \name\()_dispatcher"* + * 4. The dispather should return the right function pointer , revision and a string information . + **/ +.macro mbin_interface name:req + .extern \name\()_dispatcher + .section .data + .balign 8 + .global \name\()_dispatcher_info + .type \name\()_dispatcher_info,%object + + \name\()_dispatcher_info: + .quad \name\()_mbinit //func_entry + + .size \name\()_dispatcher_info,. - \name\()_dispatcher_info + + .balign 8 + .text + \name\()_mbinit: + //save lp fp, sub sp + .cfi_startproc + stp x29, x30, [sp, -224]! 
+ + //emit CFI directives so that debugger backtraces (e.g. GDB bt) work inside this stub + //the CFA is sp+224 after the frame push; x29/x30 are saved at its lowest addresses + .cfi_def_cfa_offset 224 + .cfi_offset 29, -224 + .cfi_offset 30, -216 + + //save x8 (indirect result), x9 and the argument registers x0-x7 / q0-q7 + stp x8, x9, [sp, 16] + .cfi_offset 8, -208 + .cfi_offset 9, -200 + stp x0, x1, [sp, 32] + .cfi_offset 0, -192 + .cfi_offset 1, -184 + stp x2, x3, [sp, 48] + .cfi_offset 2, -176 + .cfi_offset 3, -168 + stp x4, x5, [sp, 64] + .cfi_offset 4, -160 + .cfi_offset 5, -152 + stp x6, x7, [sp, 80] + .cfi_offset 6, -144 + .cfi_offset 7, -136 + stp q0, q1, [sp, 96] + .cfi_offset 64, -128 + .cfi_offset 65, -112 + stp q2, q3, [sp, 128] + .cfi_offset 66, -96 + .cfi_offset 67, -80 + stp q4, q5, [sp, 160] + .cfi_offset 68, -64 + .cfi_offset 69, -48 + stp q6, q7, [sp, 192] + .cfi_offset 70, -32 + .cfi_offset 71, -16 + + /** + * The dispatcher functions have the following prototype: + * void * function_dispatcher(void) + * so the address of the selected implementation comes back in x0. + */ + + + bl \name\()_dispatcher + //restore x8 and x9; x9 is the dispatcher_info address loaded by the entry stub below + ldp x8, x9, [sp, 16] + .cfi_restore 8 + .cfi_restore 9 + + // store the resolved entry into dispatcher_info so later calls bypass the dispatcher + str x0, [x9] + + //restore the caller's argument registers + ldp x0, x1, [sp, 32] + .cfi_restore 0 + .cfi_restore 1 + ldp x2, x3, [sp, 48] + .cfi_restore 2 + .cfi_restore 3 + ldp x4, x5, [sp, 64] + .cfi_restore 4 + .cfi_restore 5 + ldp x6, x7, [sp, 80] + .cfi_restore 6 + .cfi_restore 7 + ldp q0, q1, [sp, 96] + .cfi_restore 64 + .cfi_restore 65 + ldp q2, q3, [sp, 128] + .cfi_restore 66 + .cfi_restore 67 + ldp q4, q5, [sp, 160] + .cfi_restore 68 + .cfi_restore 69 + ldp q6, q7, [sp, 192] + .cfi_restore 70 + .cfi_restore 71 + //restore fp/lr and pop the 224-byte frame + ldp x29, x30, [sp], 224 + //CFI is back to the state at entry + .cfi_restore 30 + .cfi_restore 29 + .cfi_def_cfa_offset 0 + .cfi_endproc + + .global \name + .type \name,%function + .align 2 + \name\(): + adrp x9, :got:\name\()_dispatcher_info + ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info] + ldr x10,[x9] //current entry: _mbinit until the dispatcher result has been stored + br x10 + 
.size \name,. - \name + +.endm + +/** + * mbin_interface_base is used for the interfaces which have only + * noarch implementation + */ +.macro mbin_interface_base name:req, base:req + .extern \base + .section .data + .balign 8 + .global \name\()_dispatcher_info + .type \name\()_dispatcher_info,%object + + \name\()_dispatcher_info: + .quad \base //func_entry + .size \name\()_dispatcher_info,. - \name\()_dispatcher_info + + .balign 8 + .text + .global \name + .type \name,%function + .align 2 + \name\(): + adrp x9, :got:\name\()_dispatcher_info + ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info] + ldr x10,[x9] + br x10 + .size \name,. - \name + +.endm + +#else /* __ASSEMBLY__ */ +#include <sys/auxv.h> + + + +#define DEFINE_INTERFACE_DISPATCHER(name) \ + void * name##_dispatcher(void) + +#define PROVIDER_BASIC(name) \ + PROVIDER_INFO(name##_base) + +#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x +#define DO_PRAGMA(x) _Pragma (#x) +#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x) +#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push) +#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop) + + +#define PROVIDER_INFO(_func_entry) \ + ({ DIGNOSTIC_PUSH() \ + DIGNOSTIC_IGNORE(-Wnested-externs) \ + extern void _func_entry(void); \ + DIGNOSTIC_POP() \ + _func_entry; \ + }) + +/** + * Micro-Architector definitions + * Reference: https://developer.arm.com/docs/ddi0595/f/aarch64-system-registers/midr_el1 + */ + +#define CPU_IMPLEMENTER_RESERVE 0x00 +#define CPU_IMPLEMENTER_ARM 0x41 + + +#define CPU_PART_CORTEX_A57 0xD07 +#define CPU_PART_CORTEX_A72 0xD08 +#define CPU_PART_NEOVERSE_N1 0xD0C + +#define MICRO_ARCH_ID(imp,part) \ + (((CPU_IMPLEMENTER_##imp&0xff)<<24)|((CPU_PART_##part&0xfff)<<4)) + +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1<<11) +#endif + +/** + * @brief get_micro_arch_id + * + * read micro-architector register instruction if possible.This function + * provides microarchitecture information and make microarchitecture optimization + 
* possible. + * + * Reading system registers (MRS) is forbidden in userspace. If executed, it + * raises an illegal-instruction error. The kernel provides a solution for + * this issue. The solution depends on the HWCAP_CPUID flag. Reference (1) + * describes how to use it. It provides an "illegal instruction" handler + * in kernel space; the handler will execute MRS and return the correct + * value to userspace. + * + * To avoid too many kernel traps, this function MUST only be called in a + * dispatcher, and the HWCAP check must match. That makes sure there are no + * illegal instruction errors. HWCAP_CPUID should be available to get the + * best performance. + * + * NOTICE: + * - HWCAP_CPUID should be available. Otherwise it returns the reserved value + * - It MUST be called inside a dispatcher. + * - It MUST meet the HWCAP requirements + * + * Example: + * DEFINE_INTERFACE_DISPATCHER(crc32_iscsi) + * { + * unsigned long auxval = getauxval(AT_HWCAP); + * // The HWCAP check below is mandatory. + * if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) { + * switch (get_micro_arch_id()) { + * case MICRO_ARCH_ID(ARM, CORTEX_A57): + * return PROVIDER_INFO(crc32_pmull_crc_for_a57); + * case MICRO_ARCH_ID(ARM, CORTEX_A72): + * return PROVIDER_INFO(crc32_pmull_crc_for_a72); + * case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + * return PROVIDER_INFO(crc32_pmull_crc_for_n1); + * default: + * return PROVIDER_INFO(crc32_pmull_crc_for_others); + * } + * } + * return PROVIDER_BASIC(crc32_iscsi); + * } + * KNOWN ISSUE: + * On a heterogeneous system (big.LITTLE), it will work but the performance + * might not be the best one as expected. + * + * If this function is called on the big core, it will return the function + * optimized for the big core. + * + * If execution is then scheduled to the little core, it will still work (1), + * but the function won't be optimized for the little core, thus the performance + * won't be as expected. 
+ * + * References: + * - [CPU Feature detection](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/cpu-feature-registers.rst?h=v5.5) + * + */ +static inline uint32_t get_micro_arch_id(void) +{ + uint64_t midr = CPU_IMPLEMENTER_RESERVE; /* MRS always writes a 64-bit register */ + if ((getauxval(AT_HWCAP) & HWCAP_CPUID)) { + /* With HWCAP_CPUID set the kernel traps and emulates this read */ + __asm__("mrs %0, MIDR_EL1" : "=r" (midr)); /* __asm__: plain asm is not a keyword under -std=c11 */ + } + return (uint32_t)(midr & 0xff00fff0u); /* keep implementer [31:24] and part number [15:4], as built by MICRO_ARCH_ID */ +} + + + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/src/isa-l/include/crc.h b/src/isa-l/include/crc.h new file mode 100644 index 000000000..071496083 --- /dev/null +++ b/src/isa-l/include/crc.h @@ -0,0 +1,212 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file crc.h + * @brief CRC functions. + */ + + +#ifndef _CRC_H_ +#define _CRC_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Multi-binary functions */ + +/** + * @brief Generate CRC from the T10 standard, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @returns 16 bit CRC + */ +uint16_t crc16_t10dif( + uint16_t init_crc, //!< initial CRC value, 16 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief Generate CRC and copy T10 standard, runs appropriate version. + * + * Stitched CRC + copy function. + * + * @returns 16 bit CRC + */ +uint16_t crc16_t10dif_copy( + uint16_t init_crc, //!< initial CRC value, 16 bits + uint8_t *dst, //!< buffer destination for copy + uint8_t *src, //!< buffer source to crc + copy + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief Generate CRC from the IEEE standard, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * Note: CRC32 IEEE standard is widely used in HDLC, Ethernet, Gzip and + * many others. Its polynomial is 0x04C11DB7 in normal and 0xEDB88320 + * in reflection (or reverse). 
In ISA-L CRC, function crc32_ieee is + * actually designed for normal CRC32 IEEE version. And function + * crc32_gzip_refl is actually designed for reflected CRC32 IEEE. + * These two versions of CRC32 IEEE are not compatible with each other. + * Users who want to replace their not optimized crc32 ieee with ISA-L's + * crc32 function should be careful of that. + * Since many applications use CRC32 IEEE reflected version, Please have + * a check whether crc32_gzip_refl is right one for you instead of + * crc32_ieee. + * + * @returns 32 bit CRC + */ + +uint32_t crc32_ieee( + uint32_t init_crc, //!< initial CRC value, 32 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate the customized CRC + * based on RFC 1952 CRC (http://www.ietf.org/rfc/rfc1952.txt) standard, + * runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * Note: CRC32 IEEE standard is widely used in HDLC, Ethernet, Gzip and + * many others. Its polynomial is 0x04C11DB7 in normal and 0xEDB88320 + * in reflection (or reverse). In ISA-L CRC, function crc32_ieee is + * actually designed for normal CRC32 IEEE version. And function + * crc32_gzip_refl is actually designed for reflected CRC32 IEEE. + * These two versions of CRC32 IEEE are not compatible with each other. + * Users who want to replace their not optimized crc32 ieee with ISA-L's + * crc32 function should be careful of that. + * Since many applications use CRC32 IEEE reflected version, Please have + * a check whether crc32_gzip_refl is right one for you instead of + * crc32_ieee. 
+ * + * @returns 32 bit CRC + */ +uint32_t crc32_gzip_refl( + uint32_t init_crc, //!< initial CRC value, 32 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief ISCSI CRC function, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @returns 32 bit CRC + */ +unsigned int crc32_iscsi( + unsigned char *buffer, //!< buffer to calculate CRC on + int len, //!< buffer length in bytes + unsigned int init_crc //!< initial CRC value + ); + + +/* Base functions */ + +/** + * @brief ISCSI CRC function, baseline version + * @returns 32 bit CRC + */ +unsigned int crc32_iscsi_base( + unsigned char *buffer, //!< buffer to calculate CRC on + int len, //!< buffer length in bytes + unsigned int crc_init //!< initial CRC value + ); + + +/** + * @brief Generate CRC from the T10 standard, runs baseline version + * @returns 16 bit CRC + */ +uint16_t crc16_t10dif_base( + uint16_t seed, //!< initial CRC value, 16 bits + uint8_t *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief Generate CRC and copy T10 standard, runs baseline version. 
+ * @returns 16 bit CRC + */ +uint16_t crc16_t10dif_copy_base( + uint16_t init_crc, //!< initial CRC value, 16 bits + uint8_t *dst, //!< buffer destination for copy + uint8_t *src, //!< buffer source to crc + copy + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +/** + * @brief Generate CRC from the IEEE standard, runs baseline version + * @returns 32 bit CRC + */ +uint32_t crc32_ieee_base( + uint32_t seed, //!< initial CRC value, 32 bits + uint8_t *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate the customized CRC + * based on RFC 1952 CRC (http://www.ietf.org/rfc/rfc1952.txt) standard, + * runs baseline version + * @returns 32 bit CRC + */ +uint32_t crc32_gzip_refl_base( + uint32_t seed, //!< initial CRC value, 32 bits + uint8_t *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + + +#ifdef __cplusplus +} +#endif + +#endif // _CRC_H_ diff --git a/src/isa-l/include/crc64.h b/src/isa-l/include/crc64.h new file mode 100644 index 000000000..d0e02748c --- /dev/null +++ b/src/isa-l/include/crc64.h @@ -0,0 +1,277 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file crc64.h + * @brief CRC64 functions. + */ + + +#ifndef _CRC64_H_ +#define _CRC64_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Multi-binary functions */ + +/** + * @brief Generate CRC from ECMA-182 standard in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_ecma_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. 
+ * @returns 64 bit CRC + */ +uint64_t crc64_ecma_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_iso_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_iso_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_jones_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. 
+ * @returns 64 bit CRC + */ +uint64_t crc64_jones_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/* Arch specific versions */ + +/** + * @brief Generate CRC from ECMA-182 standard in reflected format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_ecma_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in normal format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_ecma_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_ecma_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ECMA-182 standard in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_ecma_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in reflected format. 
+ * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_iso_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_iso_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_iso_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_iso_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_jones_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format. 
+ * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_jones_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_jones_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_jones_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +#ifdef __cplusplus +} +#endif + +#endif // _CRC64_H_ diff --git a/src/isa-l/include/erasure_code.h b/src/isa-l/include/erasure_code.h new file mode 100644 index 000000000..2f9a257e5 --- /dev/null +++ b/src/isa-l/include/erasure_code.h @@ -0,0 +1,947 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _ERASURE_CODE_H_ +#define _ERASURE_CODE_H_ + +/** + * @file erasure_code.h + * @brief Interface to functions supporting erasure code encode and decode. + * + * This file defines the interface to optimized functions used in erasure + * codes. Encode and decode of erasures in GF(2^8) are made by calculating the + * dot product of the symbols (bytes in GF(2^8)) across a set of buffers and a + * set of coefficients. Values for the coefficients are determined by the type + * of erasure code. Using a general dot product means that any sequence of + * coefficients may be used including erasure codes based on random + * coefficients. + * Multiple versions of dot product are supplied to calculate 1-6 output + * vectors in one pass. + * Base GF multiply and divide functions can be sped up by defining + * GF_LARGE_TABLES at the expense of memory size. 
+ * + */ + +#include "gf_vect_mul.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Initialize tables for fast Erasure Code encode and decode. + * + * Generates the expanded tables needed for fast encode or decode for erasure + * codes on blocks of data. 32bytes is generated for each input coefficient. + * + * @param k The number of vector sources or rows in the generator matrix + * for coding. + * @param rows The number of output vectors to concurrently encode/decode. + * @param a Pointer to sets of arrays of input coefficients used to encode + * or decode data. + * @param gftbls Pointer to start of space for concatenated output tables + * generated from input coefficients. Must be of size 32*k*rows. + * @returns none + */ + +void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls); + +/** + * @brief Generate or decode erasure codes on blocks of data, runs appropriate version. + * + * Given a list of source data blocks, generate one or multiple blocks of + * encoded data as specified by a matrix of GF(2^8) coefficients. When given a + * suitable set of coefficients, this function will perform the fast generation + * or decoding of Reed-Solomon type erasure codes. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param len Length of each block of data (vector) of source or dest data. + * @param k The number of vector sources or rows in the generator matrix + * for coding. + * @param rows The number of output vectors to concurrently encode/decode. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*k*rows + * @param data Array of pointers to source input buffers. + * @param coding Array of pointers to coded output buffers. 
+ * @returns none + */ + +void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data, runs baseline version. + * + * Baseline version of ec_encode_data() with same parameters. + */ +void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, + unsigned char **dest); + +/** + * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version. + * + * Given one source data block, update one or multiple blocks of encoded data as + * specified by a matrix of GF(2^8) coefficients. When given a suitable set of + * coefficients, this function will perform the fast generation or decoding of + * Reed-Solomon type erasure codes from one input source at a time. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param len Length of each block of data (vector) of source or dest data. + * @param k The number of vector sources or rows in the generator matrix + * for coding. + * @param rows The number of output vectors to concurrently encode/decode. + * @param vec_i The vector index corresponding to the single input source. + * @param g_tbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*k*rows + * @param data Pointer to single input source used to update output parity. + * @param coding Array of pointers to coded output buffers. + * @returns none + */ +void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Baseline version of ec_encode_data_update(). 
+ */ + +void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v, + unsigned char *data, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product, runs baseline version. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. Only elements 32*CONST*j + 1 + * of this array are used, where j = (0, 1, 2...) and CONST is the + * number of elements in the array of input coefficients. The + * elements used correspond to the original input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + + +void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product, runs appropriate version. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. 
+ * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, runs appropriate version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constant and add to destination array. Can be used for erasure coding encode + * and decode update when only one source is available at a time. Function + * requires pre-calculation of a 32*vec byte constant array based on the input + * coefficients. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param len Length of each vector in bytes. Must be >= 64. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, baseline version. + * + * Baseline version of gf_vect_mad() with same parameters. + */ + +void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src, + unsigned char *dest); + +// x86 only +#if defined(__i386__) || defined(__x86_64__) + +/** + * @brief Generate or decode erasure codes on blocks of data. + * + * Arch specific version of ec_encode_data() with same parameters.
+ * @requires SSE4.1 + */ +void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data. + * + * Arch specific version of ec_encode_data() with same parameters. + * @requires AVX + */ +void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data. + * + * Arch specific version of ec_encode_data() with same parameters. + * @requires AVX2 + */ +void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires SSE4.1 + */ + +void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires AVX + */ + +void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires AVX2 + */ + +void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief GF(2^8) vector dot product. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. 
Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product. + * + * Does a GF(2^8) dot product across each byte of the input array and a constant + * set of coefficients to produce each byte of the output. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 32*vlen byte constant array based on the input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. 
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based + * on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector dot product with two outputs. + * + * Vector dot product optimized to calculate two outputs at a time. Does two + * GF(2^8) dot products across each byte of the input array and two constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 2*32*vlen byte constant array based on the two sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with two outputs. + * + * Vector dot product optimized to calculate two outputs at a time. Does two + * GF(2^8) dot products across each byte of the input array and two constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 2*32*vlen byte constant array based on the two sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. 
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with two outputs. + * + * Vector dot product optimized to calculate two outputs at a time. Does two + * GF(2^8) dot products across each byte of the input array and two constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 2*32*vlen byte constant array based on the two sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with three outputs. + * + * Vector dot product optimized to calculate three outputs at a time. Does three + * GF(2^8) dot products across each byte of the input array and three constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 3*32*vlen byte constant array based on the three sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. 
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with three outputs. + * + * Vector dot product optimized to calculate three outputs at a time. Does three + * GF(2^8) dot products across each byte of the input array and three constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 3*32*vlen byte constant array based on the three sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with three outputs. + * + * Vector dot product optimized to calculate three outputs at a time. Does three + * GF(2^8) dot products across each byte of the input array and three constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 3*32*vlen byte constant array based on the three sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. 
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with four outputs. + * + * Vector dot product optimized to calculate four outputs at a time. Does four + * GF(2^8) dot products across each byte of the input array and four constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 4*32*vlen byte constant array based on the four sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with four outputs. + * + * Vector dot product optimized to calculate four outputs at a time. Does four + * GF(2^8) dot products across each byte of the input array and four constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 4*32*vlen byte constant array based on the four sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. 
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with four outputs. + * + * Vector dot product optimized to calculate four outputs at a time. Does four + * GF(2^8) dot products across each byte of the input array and four constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 4*32*vlen byte constant array based on the four sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with five outputs. + * + * Vector dot product optimized to calculate five outputs at a time. Does five + * GF(2^8) dot products across each byte of the input array and five constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 5*32*vlen byte constant array based on the five sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with five outputs. + * + * Vector dot product optimized to calculate five outputs at a time. Does five + * GF(2^8) dot products across each byte of the input array and five constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 5*32*vlen byte constant array based on the five sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with five outputs. + * + * Vector dot product optimized to calculate five outputs at a time. Does five + * GF(2^8) dot products across each byte of the input array and five constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 5*32*vlen byte constant array based on the five sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with six outputs. + * + * Vector dot product optimized to calculate six outputs at a time. Does six + * GF(2^8) dot products across each byte of the input array and six constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 6*32*vlen byte constant array based on the six sets of input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with six outputs. + * + * Vector dot product optimized to calculate six outputs at a time. Does six + * GF(2^8) dot products across each byte of the input array and six constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 6*32*vlen byte constant array based on the six sets of input coefficients. + * @requires AVX + * + * @param len Length of each vector in bytes. Must be >= 16. + * @param vlen Number of vector sources. 
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector dot product with six outputs. + * + * Vector dot product optimized to calculate six outputs at a time. Does six + * GF(2^8) dot products across each byte of the input array and six constant + * sets of coefficients to produce each byte of the outputs. Can be used for + * erasure coding encode and decode. Function requires pre-calculation of a + * 6*32*vlen byte constant array based on the six sets of input coefficients. + * @requires AVX2 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vlen Number of vector sources. + * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants + * based on the array of input coefficients. + * @param src Array of pointers to source inputs. + * @param dest Array of pointers to destination data buffers. + * @returns none + */ + +void gf_6vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires SSE4.1 + */ + +void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires AVX + */ + +void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. 
+ * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires AVX2 + */ + +void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + + +/** + * @brief GF(2^8) vector multiply with 2 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse(). + * @requires AVX + */ +void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse(). + * @requires AVX2 + */ +void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. 
Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse(). + * @requires AVX + */ +void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse(). + * @requires AVX2 + */ +void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 4 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. 
+ * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse(). + * @requires AVX + */ +void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse(). + * @requires AVX2 + */ +void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 5 accumulate. SSE version. + * @requires SSE4.1 + */ +void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 5 accumulate. AVX version. + * @requires AVX + */ +void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version. + * @requires AVX2 + */ +void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 6 accumulate. SSE version. + * @requires SSE4.1 + */ +void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 6 accumulate. AVX version. 
+ * @requires AVX + */ +void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version. + * @requires AVX2 + */ +void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +#endif + +/********************************************************************** + * The remaining are lib support functions used in GF(2^8) operations. + */ + +/** + * @brief Single element GF(2^8) multiply. + * + * @param a Multiplicand a + * @param b Multiplicand b + * @returns Product of a and b in GF(2^8) + */ + +unsigned char gf_mul(unsigned char a, unsigned char b); + +/** + * @brief Single element GF(2^8) inverse. + * + * @param a Input element + * @returns Field element b such that a x b = {1} + */ + +unsigned char gf_inv(unsigned char a); + +/** + * @brief Generate a matrix of coefficients to be used for encoding. + * + * Vandermonde matrix example of encoding coefficients where high portion of + * matrix is identity matrix I and lower portion is constructed as 2^{i*(j-k+1)} + * i:{0,k-1} j:{k,m-1}. Commonly used method for choosing coefficients in + * erasure encoding but does not guarantee invertibility for every sub-matrix. For + * large pairs of m and k it is possible to find cases where the decode matrix + * chosen from sources and parity is not invertible. Users may want to adjust + * for certain pairs m and k. If m and k satisfy one of the following + * inequalities, no adjustment is required: + * + * - k <= 3 + * - k = 4, m <= 25 + * - k = 5, m <= 10 + * - k <= 21, m-k = 4 + * - m - k <= 3. + * + * @param a [m x k] array to hold coefficients + * @param m number of rows in matrix corresponding to srcs + parity. + * @param k number of columns in matrix corresponding to srcs.
+ * @returns none + */ + +void gf_gen_rs_matrix(unsigned char *a, int m, int k); + +/** + * @brief Generate a Cauchy matrix of coefficients to be used for encoding. + * + * Cauchy matrix example of encoding coefficients where high portion of matrix + * is identity matrix I and lower portion is constructed as 1/(i + j) | i != j, + * i:{0,k-1} j:{k,m-1}. Any sub-matrix of a Cauchy matrix should be invertible. + * + * @param a [m x k] array to hold coefficients + * @param m number of rows in matrix corresponding to srcs + parity. + * @param k number of columns in matrix corresponding to srcs. + * @returns none + */ + +void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k); + +/** + * @brief Invert a matrix in GF(2^8) + * + * Attempts to construct an n x n inverse of the input matrix. Returns non-zero + * if singular. Will always destroy input matrix in process. + * + * @param in input matrix, destroyed by invert process + * @param out output matrix such that [in] x [out] = [I] - identity matrix + * @param n size of matrix [nxn] + * @returns 0 successful, other fail on singular input matrix + */ + +int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n); + + +/*************************************************************/ + +#ifdef __cplusplus +} +#endif + +#endif //_ERASURE_CODE_H_ diff --git a/src/isa-l/include/gf_vect_mul.h b/src/isa-l/include/gf_vect_mul.h new file mode 100644 index 000000000..70a0ab2ed --- /dev/null +++ b/src/isa-l/include/gf_vect_mul.h @@ -0,0 +1,152 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _GF_VECT_MUL_H +#define _GF_VECT_MUL_H + +/** + * @file gf_vect_mul.h + * @brief Interface to functions for vector (block) multiplication in GF(2^8). + * + * This file defines the interface to routines used in fast RAID rebuild and + * erasure codes. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +// x86 only +#if defined(__i386__) || defined(__x86_64__) + + /** + * @brief GF(2^8) vector multiply by constant. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. 
gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len + * and src must be aligned to 32B. + * @requires SSE4.1 + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + */ + +int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest); + + + /** + * @brief GF(2^8) vector multiply by constant. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len + * and src must be aligned to 32B. + * @requires AVX + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + */ + +int gf_vect_mul_avx(int len, unsigned char *gftbl, void *src, void *dest); + +#endif + +/** + * @brief GF(2^8) vector multiply by constant, runs appropriate version. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. + * Len and src must be aligned to 32B. 
+ * + * This function determines what instruction sets are enabled + * and selects the appropriate version at runtime. + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + */ + +int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest); + + +/** + * @brief Initialize 32-byte constant array for GF(2^8) vector multiply + * + * Calculates array {C{00}, C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, + * C{20}, ... , C{f0} } as required by other fast vector multiply + * functions. + * @param c Constant input. + * @param gftbl Table output. + */ + +void gf_vect_mul_init(unsigned char c, unsigned char* gftbl); + + +/** + * @brief GF(2^8) vector multiply by constant, runs baseline version. + * + * Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C + * is a single field element in GF(2^8). Can be used for RAID6 rebuild + * and partial write functions. Function requires pre-calculation of a + * 32-element constant array based on constant C. gftbl(C) = {C{00}, + * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len + * and src must be aligned to 32B. + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param a Pointer to 32-byte array of pre-calculated constants based on C. + * Only the 2nd element is used. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B.
+ */ + +void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, + unsigned char *dest); + +#ifdef __cplusplus +} +#endif + +#endif //_GF_VECT_MUL_H diff --git a/src/isa-l/include/igzip_lib.h b/src/isa-l/include/igzip_lib.h new file mode 100644 index 000000000..57333748b --- /dev/null +++ b/src/isa-l/include/igzip_lib.h @@ -0,0 +1,990 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#ifndef _IGZIP_H +#define _IGZIP_H + +/** + * @file igzip_lib.h + * + * @brief This file defines the igzip compression and decompression interface, a + * high performance deflate compression interface for storage applications. + * + * Deflate is a widely used compression standard that can be used standalone, it + * also forms the basis of gzip and zlib compression formats. Igzip supports the + * following flush features: + * + * - No Flush: The default method where no special flush is performed. + * + * - Sync flush: whereby isal_deflate() finishes the current deflate block at + * the end of each input buffer. The deflate block is byte aligned by + * appending an empty stored block. + * + * - Full flush: whereby isal_deflate() finishes and aligns the deflate block as + * in sync flush but also ensures that subsequent block's history does not + * look back beyond this point and new blocks are fully independent. + * + * Igzip also supports compression levels from ISAL_DEF_MIN_LEVEL to + * ISAL_DEF_MAX_LEVEL. + * + * Igzip contains some behavior configurable at compile time. These + * configurable options are: + * + * - IGZIP_HIST_SIZE - Defines the window size. The default value is 32K (note K + * represents 1024), but 8K is also supported. Powers of 2 which are at most + * 32K may also work. + * + * - LONGER_HUFFTABLE - Defines whether to use a larger hufftables structure + * which may increase performance with smaller IGZIP_HIST_SIZE values. By + * default this option is not defined. This define sets IGZIP_HIST_SIZE to be + * 8K if IGZIP_HIST_SIZE > 8K. + * + * As an example, to compile igzip with an 8K window size, in a terminal run + * @verbatim gmake D="-D IGZIP_HIST_SIZE=8*1024" @endverbatim on Linux and + * FreeBSD, or with @verbatim nmake -f Makefile.nmake D="-D + * IGZIP_HIST_SIZE=8*1024" @endverbatim on Windows.
+ * + */ +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* Deflate Compression Standard Defines */ +/******************************************************************************/ +#define IGZIP_K 1024 +#define ISAL_DEF_MAX_HDR_SIZE 328 +#define ISAL_DEF_MAX_CODE_LEN 15 +#define ISAL_DEF_HIST_SIZE (32*IGZIP_K) +#define ISAL_DEF_MAX_HIST_BITS 15 +#define ISAL_DEF_MAX_MATCH 258 +#define ISAL_DEF_MIN_MATCH 3 + +#define ISAL_DEF_LIT_SYMBOLS 257 +#define ISAL_DEF_LEN_SYMBOLS 29 +#define ISAL_DEF_DIST_SYMBOLS 30 +#define ISAL_DEF_LIT_LEN_SYMBOLS (ISAL_DEF_LIT_SYMBOLS + ISAL_DEF_LEN_SYMBOLS) + +/* Max repeat length, rounded up to 32 byte boundary */ +#define ISAL_LOOK_AHEAD ((ISAL_DEF_MAX_MATCH + 31) & ~31) + +/******************************************************************************/ +/* Deflate Implementation Specific Defines */ +/******************************************************************************/ +/* Note IGZIP_HIST_SIZE must be a power of two */ +#ifndef IGZIP_HIST_SIZE +#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE +#endif + +#if (IGZIP_HIST_SIZE > ISAL_DEF_HIST_SIZE) +#undef IGZIP_HIST_SIZE +#define IGZIP_HIST_SIZE ISAL_DEF_HIST_SIZE +#endif + +#ifdef LONGER_HUFFTABLE +#if (IGZIP_HIST_SIZE > 8 * IGZIP_K) +#undef IGZIP_HIST_SIZE +#define IGZIP_HIST_SIZE (8 * IGZIP_K) +#endif +#endif + +#define ISAL_LIMIT_HASH_UPDATE + +#define IGZIP_HASH8K_HASH_SIZE (8 * IGZIP_K) +#define IGZIP_HASH_HIST_SIZE IGZIP_HIST_SIZE +#define IGZIP_HASH_MAP_HASH_SIZE IGZIP_HIST_SIZE + +#define IGZIP_LVL0_HASH_SIZE (8 * IGZIP_K) +#define IGZIP_LVL1_HASH_SIZE IGZIP_HASH8K_HASH_SIZE +#define IGZIP_LVL2_HASH_SIZE IGZIP_HASH_HIST_SIZE +#define IGZIP_LVL3_HASH_SIZE IGZIP_HASH_MAP_HASH_SIZE + +#ifdef LONGER_HUFFTABLE +enum {IGZIP_DIST_TABLE_SIZE = 8*1024}; + +/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */ +enum { IGZIP_DECODE_OFFSET = 26 }; +#else +enum 
{IGZIP_DIST_TABLE_SIZE = 2}; +/* DECODE_OFFSET is dist code index corresponding to DIST_TABLE_SIZE + 1 */ +enum { IGZIP_DECODE_OFFSET = 0 }; +#endif +enum {IGZIP_LEN_TABLE_SIZE = 256}; +enum {IGZIP_LIT_TABLE_SIZE = ISAL_DEF_LIT_SYMBOLS}; + +#define IGZIP_HUFFTABLE_CUSTOM 0 +#define IGZIP_HUFFTABLE_DEFAULT 1 +#define IGZIP_HUFFTABLE_STATIC 2 + +/* Flush Flags */ +#define NO_FLUSH 0 /* Default */ +#define SYNC_FLUSH 1 +#define FULL_FLUSH 2 +#define FINISH_FLUSH 0 /* Deprecated */ + +/* Gzip Flags */ +#define IGZIP_DEFLATE 0 /* Default */ +#define IGZIP_GZIP 1 +#define IGZIP_GZIP_NO_HDR 2 +#define IGZIP_ZLIB 3 +#define IGZIP_ZLIB_NO_HDR 4 + +/* Compression Return values */ +#define COMP_OK 0 +#define INVALID_FLUSH -7 +#define INVALID_PARAM -8 +#define STATELESS_OVERFLOW -1 +#define ISAL_INVALID_OPERATION -9 +#define ISAL_INVALID_STATE -3 +#define ISAL_INVALID_LEVEL -4 /* Invalid Compression level set */ +#define ISAL_INVALID_LEVEL_BUF -5 /* Invalid buffer specified for the compression level */ + +/** + * @enum isal_zstate_state + * @brief Compression State please note ZSTATE_TRL only applies for GZIP compression + */ + + +/* When the state is set to ZSTATE_NEW_HDR or TMP_ZSTATE_NEW_HEADER, the + * hufftable being used for compression may be swapped + */ +enum isal_zstate_state { + ZSTATE_NEW_HDR, //!< Header to be written + ZSTATE_HDR, //!< Header state + ZSTATE_CREATE_HDR, //!< Header to be created + ZSTATE_BODY, //!< Body state + ZSTATE_FLUSH_READ_BUFFER, //!< Flush buffer + ZSTATE_FLUSH_ICF_BUFFER, + ZSTATE_TYPE0_HDR, //! 
Type0 block header to be written + ZSTATE_TYPE0_BODY, //!< Type0 block body to be written + ZSTATE_SYNC_FLUSH, //!< Write sync flush block + ZSTATE_FLUSH_WRITE_BUFFER, //!< Flush bitbuf + ZSTATE_TRL, //!< Trailer state + ZSTATE_END, //!< End state + ZSTATE_TMP_NEW_HDR, //!< Temporary Header to be written + ZSTATE_TMP_HDR, //!< Temporary Header state + ZSTATE_TMP_CREATE_HDR, //!< Temporary Header to be created state + ZSTATE_TMP_BODY, //!< Temporary Body state + ZSTATE_TMP_FLUSH_READ_BUFFER, //!< Flush buffer + ZSTATE_TMP_FLUSH_ICF_BUFFER, + ZSTATE_TMP_TYPE0_HDR, //!< Temporary Type0 block header to be written + ZSTATE_TMP_TYPE0_BODY, //!< Temporary Type0 block body to be written + ZSTATE_TMP_SYNC_FLUSH, //!< Write sync flush block + ZSTATE_TMP_FLUSH_WRITE_BUFFER, //!< Flush bitbuf + ZSTATE_TMP_TRL, //!< Temporary Trailer state + ZSTATE_TMP_END //!< Temporary End state +}; + +/* Offset used to switch between TMP states and non-tmp states (parenthesized so it is safe inside larger expressions) */ +#define ZSTATE_TMP_OFFSET (ZSTATE_TMP_HDR - ZSTATE_HDR) + +/******************************************************************************/ +/* Inflate Implementation Specific Defines */ +/******************************************************************************/ +#define ISAL_DECODE_LONG_BITS 12 +#define ISAL_DECODE_SHORT_BITS 10 + +/* Current state of decompression */ +enum isal_block_state { + ISAL_BLOCK_NEW_HDR, /* Just starting a new block */ + ISAL_BLOCK_HDR, /* In the middle of reading in a block header */ + ISAL_BLOCK_TYPE0, /* Decoding a type 0 block */ + ISAL_BLOCK_CODED, /* Decoding a huffman coded block */ + ISAL_BLOCK_INPUT_DONE, /* Decompression of input is completed */ + ISAL_BLOCK_FINISH, /* Decompression of input is completed and all data has been flushed to output */ + ISAL_GZIP_EXTRA_LEN, + ISAL_GZIP_EXTRA, + ISAL_GZIP_NAME, + ISAL_GZIP_COMMENT, + ISAL_GZIP_HCRC, + ISAL_ZLIB_DICT, + ISAL_CHECKSUM_CHECK, +}; + + +/* Inflate Flags */ +#define ISAL_DEFLATE 0 /* Default */ +#define ISAL_GZIP 1 +#define
ISAL_GZIP_NO_HDR 2 +#define ISAL_ZLIB 3 +#define ISAL_ZLIB_NO_HDR 4 +#define ISAL_ZLIB_NO_HDR_VER 5 +#define ISAL_GZIP_NO_HDR_VER 6 + +/* Inflate Return values */ +#define ISAL_DECOMP_OK 0 /* No errors encountered while decompressing */ +#define ISAL_END_INPUT 1 /* End of input reached */ +#define ISAL_OUT_OVERFLOW 2 /* End of output reached */ +#define ISAL_NAME_OVERFLOW 3 /* End of gzip name buffer reached */ +#define ISAL_COMMENT_OVERFLOW 4 /* End of gzip comment buffer reached */ +#define ISAL_EXTRA_OVERFLOW 5 /* End of extra buffer reached */ +#define ISAL_NEED_DICT 6 /* Stream needs a dictionary to continue */ +#define ISAL_INVALID_BLOCK -1 /* Invalid deflate block found */ +#define ISAL_INVALID_SYMBOL -2 /* Invalid deflate symbol found */ +#define ISAL_INVALID_LOOKBACK -3 /* Invalid lookback distance found */ +#define ISAL_INVALID_WRAPPER -4 /* Invalid gzip/zlib wrapper found */ +#define ISAL_UNSUPPORTED_METHOD -5 /* Gzip/zlib wrapper specifies unsupported compress method */ +#define ISAL_INCORRECT_CHECKSUM -6 /* Incorrect checksum found */ + +/******************************************************************************/ +/* Compression structures */ +/******************************************************************************/ +/** @brief Holds histogram of deflate symbols*/ +struct isal_huff_histogram { + uint64_t lit_len_histogram[ISAL_DEF_LIT_LEN_SYMBOLS]; //!< Histogram of Literal/Len symbols seen + uint64_t dist_histogram[ISAL_DEF_DIST_SYMBOLS]; //!< Histogram of Distance Symbols seen + uint16_t hash_table[IGZIP_LVL0_HASH_SIZE]; //!< Tmp space used as a hash table +}; + +struct isal_mod_hist { + uint32_t d_hist[30]; + uint32_t ll_hist[513]; +}; + +#define ISAL_DEF_MIN_LEVEL 0 +#define ISAL_DEF_MAX_LEVEL 3 + +/* Defines used to set level data sizes */ +/* has to be at least sizeof(struct level_buf) + sizeof(struct lvlX_buf) */ +#define ISAL_DEF_LVL0_REQ 0 +#define ISAL_DEF_LVL1_REQ (4 * IGZIP_K + 2 * IGZIP_LVL1_HASH_SIZE) +#define ISAL_DEF_LVL1_TOKEN_SIZE
4 +#define ISAL_DEF_LVL2_REQ (4 * IGZIP_K + 2 * IGZIP_LVL2_HASH_SIZE) +#define ISAL_DEF_LVL2_TOKEN_SIZE 4 +#define ISAL_DEF_LVL3_REQ (4 * IGZIP_K + 4 * 4 * IGZIP_K + 2 * IGZIP_LVL3_HASH_SIZE) /* parenthesized like ISAL_DEF_LVL2_REQ so it is safe inside larger expressions */ +#define ISAL_DEF_LVL3_TOKEN_SIZE 4 + +/* Data sizes for level specific data options */ +#define ISAL_DEF_LVL0_MIN ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_SMALL ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_MEDIUM ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_LARGE ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_EXTRA_LARGE ISAL_DEF_LVL0_REQ +#define ISAL_DEF_LVL0_DEFAULT ISAL_DEF_LVL0_REQ + +#define ISAL_DEF_LVL1_MIN (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 1 * IGZIP_K) +#define ISAL_DEF_LVL1_SMALL (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 16 * IGZIP_K) +#define ISAL_DEF_LVL1_MEDIUM (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 32 * IGZIP_K) +#define ISAL_DEF_LVL1_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 64 * IGZIP_K) +#define ISAL_DEF_LVL1_EXTRA_LARGE (ISAL_DEF_LVL1_REQ + ISAL_DEF_LVL1_TOKEN_SIZE * 128 * IGZIP_K) +#define ISAL_DEF_LVL1_DEFAULT ISAL_DEF_LVL1_LARGE + +#define ISAL_DEF_LVL2_MIN (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 1 * IGZIP_K) +#define ISAL_DEF_LVL2_SMALL (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 16 * IGZIP_K) +#define ISAL_DEF_LVL2_MEDIUM (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 32 * IGZIP_K) +#define ISAL_DEF_LVL2_LARGE (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 64 * IGZIP_K) +#define ISAL_DEF_LVL2_EXTRA_LARGE (ISAL_DEF_LVL2_REQ + ISAL_DEF_LVL2_TOKEN_SIZE * 128 * IGZIP_K) +#define ISAL_DEF_LVL2_DEFAULT ISAL_DEF_LVL2_LARGE + +#define ISAL_DEF_LVL3_MIN (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 1 * IGZIP_K) +#define ISAL_DEF_LVL3_SMALL (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 16 * IGZIP_K) +#define ISAL_DEF_LVL3_MEDIUM (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 32 * IGZIP_K) +#define ISAL_DEF_LVL3_LARGE (ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 64 * IGZIP_K) +#define ISAL_DEF_LVL3_EXTRA_LARGE
(ISAL_DEF_LVL3_REQ + ISAL_DEF_LVL3_TOKEN_SIZE * 128 * IGZIP_K) +#define ISAL_DEF_LVL3_DEFAULT ISAL_DEF_LVL3_LARGE + +#define IGZIP_NO_HIST 0 +#define IGZIP_HIST 1 +#define IGZIP_DICT_HIST 2 +#define IGZIP_DICT_HASH_SET 3 + +/** @brief Holds Bit Buffer information*/ +struct BitBuf2 { + uint64_t m_bits; //!< bits in the bit buffer + uint32_t m_bit_count; //!< number of valid bits in the bit buffer + uint8_t *m_out_buf; //!< current index of buffer to write to + uint8_t *m_out_end; //!< end of buffer to write to + uint8_t *m_out_start; //!< start of buffer to write to +}; + +struct isal_zlib_header { + uint32_t info; //!< base-2 logarithm of the LZ77 window size minus 8 + uint32_t level; //!< Compression level (fastest, fast, default, maximum) + uint32_t dict_id; //!< Dictionary id + uint32_t dict_flag; //!< Whether to use a dictionary +}; + +struct isal_gzip_header { + uint32_t text; //!< Optional Text hint + uint32_t time; //!< Unix modification time in gzip header + uint32_t xflags; //!< xflags in gzip header + uint32_t os; //!< OS in gzip header + uint8_t *extra; //!< Extra field in gzip header + uint32_t extra_buf_len; //!< Length of extra buffer + uint32_t extra_len; //!< Actual length of gzip header extra field + char *name; //!< Name in gzip header + uint32_t name_buf_len; //!< Length of name buffer + char *comment; //!< Comments in gzip header + uint32_t comment_buf_len; //!< Length of comment buffer + uint32_t hcrc; //!< Header crc or header crc flag + uint32_t flags; //!< Internal data +}; + +/* Variable prefixes: + * b_ : Measured wrt the start of the buffer + * f_ : Measured wrt the start of the file (aka file_start) + */ + +/** @brief Holds the internal state information for input and output compression streams*/ +struct isal_zstate { + uint32_t total_in_start; //!< Not used, may be replaced with something else + uint32_t block_next; //!< Start of current deflate block in the input + uint32_t block_end; //!< End of current deflate block in the input + 
uint32_t dist_mask; //!< Distance mask used. + uint32_t hash_mask; + enum isal_zstate_state state; //!< Current state in processing the data stream + struct BitBuf2 bitbuf; //!< Bit Buffer + uint32_t crc; //!< Current checksum without finalize step if any (adler) + uint8_t has_wrap_hdr; //!< keeps track of wrapper header + uint8_t has_eob_hdr; //!< keeps track of eob hdr (with BFINAL set) + uint8_t has_eob; //!< keeps track of eob on the last deflate block + uint8_t has_hist; //!< flag to track if there is match history + uint16_t has_level_buf_init; //!< flag to track if user supplied memory has been initialized. + uint32_t count; //!< used for partial header/trailer writes + uint8_t tmp_out_buff[16]; //!< temporary array + uint32_t tmp_out_start; //!< temporary variable + uint32_t tmp_out_end; //!< temporary variable + uint32_t b_bytes_valid; //!< number of valid bytes in buffer + uint32_t b_bytes_processed; //!< number of bytes processed in buffer + uint8_t buffer[2 * IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD]; //!< Internal buffer + + /* Stream should be setup such that the head is cache aligned*/ + uint16_t head[IGZIP_LVL0_HASH_SIZE]; //!< Hash array +}; + +/** @brief Holds the huffman tree used to huffman encode the input stream **/ +struct isal_hufftables { + + uint8_t deflate_hdr[ISAL_DEF_MAX_HDR_SIZE]; //!< deflate huffman tree header + uint32_t deflate_hdr_count; //!< Number of whole bytes in deflate_huff_hdr + uint32_t deflate_hdr_extra_bits; //!< Number of bits in the partial byte in header + uint32_t dist_table[IGZIP_DIST_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code + uint32_t len_table[IGZIP_LEN_TABLE_SIZE]; //!< bits 4:0 are the code length, bits 31:5 are the code + uint16_t lit_table[IGZIP_LIT_TABLE_SIZE]; //!< literal code + uint8_t lit_table_sizes[IGZIP_LIT_TABLE_SIZE]; //!< literal code length + uint16_t dcodes[30 - IGZIP_DECODE_OFFSET]; //!< distance code + uint8_t dcodes_sizes[30 - IGZIP_DECODE_OFFSET]; //!< distance code 
length + +}; + +/** @brief Holds stream information*/ +struct isal_zstream { + uint8_t *next_in; //!< Next input byte + uint32_t avail_in; //!< number of bytes available at next_in + uint32_t total_in; //!< total number of bytes read so far + + uint8_t *next_out; //!< Next output byte + uint32_t avail_out; //!< number of bytes available at next_out + uint32_t total_out; //!< total number of bytes written so far + + struct isal_hufftables *hufftables; //!< Huffman encoding used when compressing + uint32_t level; //!< Compression level to use + uint32_t level_buf_size; //!< Size of level_buf + uint8_t * level_buf; //!< User allocated buffer required for different compression levels + uint16_t end_of_stream; //!< non-zero if this is the last input buffer + uint16_t flush; //!< Flush type can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH + uint16_t gzip_flag; //!< Indicate if gzip compression is to be performed + uint16_t hist_bits; //!< Log base 2 of maximum lookback distance, 0 is use default + struct isal_zstate internal_state; //!< Internal state for this stream +}; + +/******************************************************************************/ +/* Inflate structures */ +/******************************************************************************/ +/* + * Inflate_huff_code data structures are used to store a Huffman code for fast + * lookup. It works by performing a lookup in small_code_lookup that hopefully + * yields the correct symbol. Otherwise a lookup into long_code_lookup is + * performed to find the correct symbol. The details of how this works follows: + * + * Let i be some index into small_code_lookup and let e be the associated + * element. Bit 15 in e is a flag. If bit 15 is not set, then index i contains + * a Huffman code for a symbol which has length at most DECODE_LOOKUP_SIZE. Bits + * 0 through 8 are the symbol associated with that code and bits 9 through 12 of + * e represent the number of bits in the code. 
If bit 15 is set, then i + * corresponds to the first DECODE_LOOKUP_SIZE bits of a Huffman code which has + * length longer than DECODE_LOOKUP_SIZE. In this case, bits 0 through 8 + * represent an offset into the long_code_lookup table and bits 9 through 12 + * represent the maximum length of a Huffman code starting with the bits in the + * index i. The offset into long_code_lookup is for an array associated with all + * codes which start with the bits in i. + * + * The elements of long_code_lookup are in the same format as small_code_lookup, + * except bit 15 is never set. Let i be a number made up of DECODE_LOOKUP_SIZE + * bits. Then all Huffman codes which start with DECODE_LOOKUP_SIZE bits are + * stored in an array starting at index h in long_code_lookup. This index h is + * stored in bits 0 through 8 at index i in small_code_lookup. The index j is an + * index of this array if the number of bits contained in j and i is the number + * of bits in the longest huff_code starting with the bits of i. The symbol + * stored at index j is the symbol whose huffcode can be found in (j << + * DECODE_LOOKUP_SIZE) | i. Note these arrays will be stored sorted in order of + * maximum Huffman code length. + * + * The following are explanations for sizes of the tables: + * + * Since small_code_lookup is a lookup on DECODE_LOOKUP_SIZE bits, it must have + * size 2^DECODE_LOOKUP_SIZE. + * + * To determine the amount of memory required for long_code_lookup, note that + * any element of long_code_lookup corresponds to a code, a duplicate of an + * existing code, or an invalid code. Since deflate Huffman codes are stored such that + * the code size and the code value form an increasing function, the number of + * duplicates is maximized when all the duplicates are contained in a single + * array, thus there are at most 2^(15 - DECODE_LOOKUP_SIZE) - + * (DECODE_LOOKUP_SIZE + 1) duplicate elements.
Similarly the number of invalid + * elements is maximized at 2^(15 - DECODE_LOOKUP_SIZE) - 2^(floor((15 - + * DECODE_LOOKUP_SIZE)/2) - 2^(ceil((15 - DECODE_LOOKUP_SIZE)/2) + 1. Thus the + * amount of memory required is: NUM_CODES + 2^(16 - DECODE_LOOKUP_SIZE) - + * (DECODE_LOOKUP_SIZE + 1) - 2^(floor((15 - DECODE_LOOKUP_SIZE)/2) - + * 2^(ceil((15 - DECODE_LOOKUP_SIZE)/2) + 1. The values used below are those + * values rounded up to the nearest 16 byte boundary + * + * Note that DECODE_LOOKUP_SIZE can be any length even though the offset in + * small_lookup_code is 9 bits long because the increasing relationship between + * code length and code value forces the maximum offset to be less than 288. + */ + +/* In the following defines, L stands for LARGE and S for SMALL */ +#define ISAL_L_REM (21 - ISAL_DECODE_LONG_BITS) +#define ISAL_S_REM (15 - ISAL_DECODE_SHORT_BITS) + +#define ISAL_L_DUP ((1 << ISAL_L_REM) - (ISAL_L_REM + 1)) +#define ISAL_S_DUP ((1 << ISAL_S_REM) - (ISAL_S_REM + 1)) + +#define ISAL_L_UNUSED ((1 << ISAL_L_REM) - (1 << ((ISAL_L_REM)/2)) - (1 << ((ISAL_L_REM + 1)/2)) + 1) +#define ISAL_S_UNUSED ((1 << ISAL_S_REM) - (1 << ((ISAL_S_REM)/2)) - (1 << ((ISAL_S_REM + 1)/2)) + 1) + +#define ISAL_L_SIZE (ISAL_DEF_LIT_LEN_SYMBOLS + ISAL_L_DUP + ISAL_L_UNUSED) +#define ISAL_S_SIZE (ISAL_DEF_DIST_SYMBOLS + ISAL_S_DUP + ISAL_S_UNUSED) + +#define ISAL_HUFF_CODE_LARGE_LONG_ALIGNED (ISAL_L_SIZE + (-ISAL_L_SIZE & 0xf)) +#define ISAL_HUFF_CODE_SMALL_LONG_ALIGNED (ISAL_S_SIZE + (-ISAL_S_SIZE & 0xf)) + +/* Large lookup table for decoding huffman codes */ +struct inflate_huff_code_large { + uint32_t short_code_lookup[1 << (ISAL_DECODE_LONG_BITS)]; + uint16_t long_code_lookup[ISAL_HUFF_CODE_LARGE_LONG_ALIGNED]; +}; + +/* Small lookup table for decoding huffman codes */ +struct inflate_huff_code_small { + uint16_t short_code_lookup[1 << (ISAL_DECODE_SHORT_BITS)]; + uint16_t long_code_lookup[ISAL_HUFF_CODE_SMALL_LONG_ALIGNED]; +}; + +/** @brief Holds decompression state 
information*/ +struct inflate_state { + uint8_t *next_out; //!< Next output Byte + uint32_t avail_out; //!< Number of bytes available at next_out + uint32_t total_out; //!< Total bytes written out so far + uint8_t *next_in; //!< Next input byte + uint64_t read_in; //!< Bits buffered to handle unaligned streams + uint32_t avail_in; //!< Number of bytes available at next_in + int32_t read_in_length; //!< Bits in read_in + struct inflate_huff_code_large lit_huff_code; //!< Structure for decoding lit/len symbols + struct inflate_huff_code_small dist_huff_code; //!< Structure for decoding dist symbols + enum isal_block_state block_state; //!< Current decompression state + uint32_t dict_length; //!< Length of dictionary used + uint32_t bfinal; //!< Flag identifying final block + uint32_t crc_flag; //!< Flag identifying whether to track of crc + uint32_t crc; //!< Contains crc or adler32 of output if crc_flag is set + uint32_t hist_bits; //!< Log base 2 of maximum lookback distance + union { + int32_t type0_block_len; //!< Length left to read of type 0 block when outbuffer overflow occurred + int32_t count; //!< Count of bytes remaining to be parsed + uint32_t dict_id; + }; + int32_t write_overflow_lits; + int32_t write_overflow_len; + int32_t copy_overflow_length; //!< Length left to copy when outbuffer overflow occurred + int32_t copy_overflow_distance; //!< Lookback distance when outbuffer overflow occurred + int16_t wrapper_flag; + int16_t tmp_in_size; //!< Number of bytes in tmp_in_buffer + int32_t tmp_out_valid; //!< Number of bytes in tmp_out_buffer + int32_t tmp_out_processed; //!< Number of bytes processed in tmp_out_buffer + uint8_t tmp_in_buffer[ISAL_DEF_MAX_HDR_SIZE]; //!< Temporary buffer containing data from the input stream + uint8_t tmp_out_buffer[2 * ISAL_DEF_HIST_SIZE + ISAL_LOOK_AHEAD]; //!< Temporary buffer containing data from the output stream +}; + +/******************************************************************************/ +/* Compression 
functions */ +/******************************************************************************/ +/** + * @brief Updates histograms to include the symbols found in the input + * stream. Since this function only updates the histograms, it can be called on + * multiple streams to get a histogram better representing the desired data + * set. When first using histogram it must be initialized by zeroing the + * structure. + * + * @param in_stream: Input stream of data. + * @param length: The length of in_stream. + * @param histogram: The returned histogram of lit/len/dist symbols. + */ +void isal_update_histogram(uint8_t * in_stream, int length, struct isal_huff_histogram * histogram); + + +/** + * @brief Creates a custom huffman code for the given histograms in which + * every literal and repeat length is assigned a code and all possible lookback + * distances are assigned a code. + * + * @param hufftables: the output structure containing the huffman code + * @param histogram: histogram containing frequency of literal symbols, + * repeat lengths and lookback distances + * @returns Returns a non zero value if an invalid huffman code was created. + */ +int isal_create_hufftables(struct isal_hufftables * hufftables, + struct isal_huff_histogram * histogram); + +/** + * @brief Creates a custom huffman code for the given histograms like + * isal_create_hufftables() except literals with 0 frequency in the histogram + * are not assigned a code + * + * @param hufftables: the output structure containing the huffman code + * @param histogram: histogram containing frequency of literal symbols, + * repeat lengths and lookback distances + * @returns Returns a non zero value if an invalid huffman code was created. + */ +int isal_create_hufftables_subset(struct isal_hufftables * hufftables, + struct isal_huff_histogram * histogram); + +/** + * @brief Initialize compression stream data structure + * + * @param stream Structure holding state information on the compression streams.
+ * @returns none + */ +void isal_deflate_init(struct isal_zstream *stream); + +/** + * @brief Reinitialize compression stream data structure. Performs the same + * action as isal_deflate_init, but does not change user supplied input such as + * the level, flush type, compression wrapper (like gzip), hufftables, and + * end_of_stream_flag. + * + * @param stream Structure holding state information on the compression streams. + * @returns none + */ +void isal_deflate_reset(struct isal_zstream *stream); + + +/** + * @brief Set gzip header default values + * + * @param gz_hdr: Gzip header to initialize. + */ +void isal_gzip_header_init(struct isal_gzip_header *gz_hdr); + +/** + * @brief Write gzip header to output stream + * + * Writes the gzip header to the output stream. On entry this function assumes + * that the output buffer has been initialized, so stream->next_out, + * stream->avail_out and stream->total_out have been set. If the output buffer + * contains insufficient space, stream is not modified. + * + * @param stream: Structure holding state information on the compression stream. + * @param gz_hdr: Structure holding the gzip header information to encode. + * + * @returns Returns 0 if the header is successfully written, otherwise returns + * the minimum size required to successfully write the gzip header to the output + * buffer. + */ +uint32_t isal_write_gzip_header(struct isal_zstream * stream, struct isal_gzip_header *gz_hdr); + +/** + * @brief Write zlib header to output stream + * + * Writes the zlib header to the output stream. On entry this function assumes + * that the output buffer has been initialized, so stream->next_out, + * stream->avail_out and stream->total_out have been set. If the output buffer + * contains insufficient space, stream is not modified. + * + * @param stream: Structure holding state information on the compression stream. + * @param z_hdr: Structure holding the zlib header information to encode. 
+ * + * @returns Returns 0 if the header is successfully written, otherwise returns + * the minimum size required to successfully write the zlib header to the output + * buffer. + */ +uint32_t isal_write_zlib_header(struct isal_zstream * stream, struct isal_zlib_header *z_hdr); + +/** + * @brief Set stream to use a new Huffman code + * + * Sets the Huffman code to be used in compression before compression start or + * after the successful completion of a SYNC_FLUSH or FULL_FLUSH. If type has + * value IGZIP_HUFFTABLE_DEFAULT, the stream is set to use the default Huffman + * code. If type has value IGZIP_HUFFTABLE_STATIC, the stream is set to use the + * deflate standard static Huffman code, or if type has value + * IGZIP_HUFFTABLE_CUSTOM, the stream is set to use the isal_hufftables + * structure input to isal_deflate_set_hufftables. + * + * @param stream: Structure holding state information on the compression stream. + * @param hufftables: new huffman code to use if type is set to + * IGZIP_HUFFTABLE_CUSTOM. + * @param type: Flag specifying what hufftable to use. + * + * @returns Returns INVALID_OPERATION if the stream was unmodified. This may be + * due to the stream being in a state where changing the huffman code is not + * allowed or an invalid input is provided. + */ +int isal_deflate_set_hufftables(struct isal_zstream *stream, + struct isal_hufftables *hufftables, int type); + +/** + * @brief Initialize compression stream data structure + * + * @param stream Structure holding state information on the compression streams. + * @returns none + */ +void isal_deflate_stateless_init(struct isal_zstream *stream); + + +/** + * @brief Set compression dictionary to use + * + * This function is to be called after isal_deflate_init, or after completing a + * SYNC_FLUSH or FULL_FLUSH and before the next call to isal_deflate. If the + * dictionary is longer than IGZIP_HIST_SIZE, only the last IGZIP_HIST_SIZE + * bytes will be used.
+ * + * @param stream Structure holding state information on the compression streams. + * @param dict: Array containing dictionary to use. + * @param dict_len: Length of dict. + * @returns COMP_OK, + * ISAL_INVALID_STATE (dictionary could not be set) + */ +int isal_deflate_set_dict(struct isal_zstream *stream, uint8_t *dict, uint32_t dict_len); + +/** @brief Structure for holding processed dictionary information */ + +struct isal_dict { + uint32_t params; + uint32_t level; + uint32_t hist_size; + uint32_t hash_size; + uint8_t history[ISAL_DEF_HIST_SIZE]; + uint16_t hashtable[IGZIP_LVL3_HASH_SIZE]; +}; + +/** + * @brief Process dictionary to reuse later + * + * Processes a dictionary so that the generated output can be reused to reset a + * new deflate stream more quickly than isal_deflate_set_dict() alone. This + * function is paired with isal_deflate_reset_dict() when using the same + * dictionary on multiple deflate objects. The stream.level must be set prior to + * calling this function to process the dictionary correctly. If the dictionary + * is longer than IGZIP_HIST_SIZE, only the last IGZIP_HIST_SIZE bytes will be + * used. + * + * @param stream Structure holding state information on the compression streams. + * @param dict_str: Structure to hold processed dictionary info to reuse later. + * @param dict: Array containing dictionary to use. + * @param dict_len: Length of dict. + * @returns COMP_OK, + * ISAL_INVALID_STATE (dictionary could not be processed) + */ +int isal_deflate_process_dict(struct isal_zstream *stream, struct isal_dict *dict_str, + uint8_t *dict, uint32_t dict_len); + +/** + * @brief Reset compression dictionary to use + * + * Similar to isal_deflate_set_dict() but on pre-processed dictionary + * data. Pairing with isal_deflate_process_dict() can reduce the processing time + * on subsequent compression with dictionary especially on small files. 
+ * + * Like isal_deflate_set_dict(), this function is to be called after + * isal_deflate_init, or after completing a SYNC_FLUSH or FULL_FLUSH and before + * the next call to isal_deflate. Changing compression level between dictionary + * process and reset will cause return of ISAL_INVALID_STATE. + * + * @param stream Structure holding state information on the compression streams. + * @param dict_str: Structure with pre-processed dictionary info. + * @returns COMP_OK, + * ISAL_INVALID_STATE or other (dictionary could not be reset) + */ +int isal_deflate_reset_dict(struct isal_zstream *stream, struct isal_dict *dict_str); + + +/** + * @brief Fast data (deflate) compression for storage applications. + * + * The call to isal_deflate() will take data from the input buffer (updating + * next_in, avail_in) and write a compressed stream to the output buffer + * (updating next_out and avail_out). The function returns when either the input + * buffer is empty or the output buffer is full. + * + * On entry to isal_deflate(), next_in points to an input buffer and avail_in + * indicates the length of that buffer. Similarly next_out points to an empty + * output buffer and avail_out indicates the size of that buffer. + * + * The fields total_in and total_out start at 0 and are updated by + * isal_deflate(). These reflect the total number of bytes read or written so far. + * + * When the last input buffer is passed in, signaled by setting the + * end_of_stream, the routine will complete compression at the end of the input + * buffer, as long as the output buffer is big enough. + * + * The compression level can be set by setting level to any value between + * ISAL_DEF_MIN_LEVEL and ISAL_DEF_MAX_LEVEL. When the compression level is + * ISAL_DEF_MIN_LEVEL, hufftables can be set to a table trained for the + * specific data type being compressed to achieve better compression.
When a + * higher compression level is desired, a larger generic memory buffer needs to + * be supplied by setting level_buf and level_buf_size to represent the chunk of + * memory. For level x, the suggested size for this buffer is + * ISAL_DEFL_LVLx_DEFAULT. The defines ISAL_DEFL_LVLx_MIN, ISAL_DEFL_LVLx_SMALL, + * ISAL_DEFL_LVLx_MEDIUM, ISAL_DEFL_LVLx_LARGE, and ISAL_DEFL_LVLx_EXTRA_LARGE + * are also provided as other suggested sizes. + * + * The equivalent of the zlib FLUSH_SYNC operation is currently supported. + * Flush types can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH. Default flush type is + * NO_FLUSH. A SYNC_ or FULL_ flush will byte align the deflate block by + * appending an empty stored block once all input has been compressed, including + * the buffered input. Checking that the out_buffer is not empty or that + * internal_state.state = ZSTATE_NEW_HDR is sufficient to guarantee all input + * has been flushed. Additionally FULL_FLUSH will ensure look back history does + * not include previous blocks so new blocks are fully independent. Switching + * between flush types is supported. + * + * If a compression dictionary is required, the dictionary can be set by calling + * isal_deflate_set_dict() before calling isal_deflate. + * + * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip + * trailer are written around the deflate compressed data. If gzip_flag is set + * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written. A full-featured + * header is supported by the isal_write_{gzip,zlib}_header() functions. + * + * @param stream Structure holding state information on the compression streams. + * @return COMP_OK (if everything is ok), + * INVALID_FLUSH (if an invalid FLUSH is selected), + * ISAL_INVALID_LEVEL (if an invalid compression level is selected), + * ISAL_INVALID_LEVEL_BUF (if the level buffer is not large enough).
+ */ +int isal_deflate(struct isal_zstream *stream); + + +/** + * @brief Fast data (deflate) stateless compression for storage applications. + * + * Stateless (one shot) compression routine with a similar interface to + * isal_deflate() but operates on entire input buffer at one time. Parameter + * avail_out must be large enough to fit the entire compressed output. Max + * expansion is limited to the input size plus the header size of a stored/raw + * block. + * + * When the compression level is set to 1, unlike in isal_deflate(), level_buf + * may be optionally set depending on what what performance is desired. + * + * For stateless the flush types NO_FLUSH and FULL_FLUSH are supported. + * FULL_FLUSH will byte align the output deflate block so additional blocks can + * be easily appended. + * + * If the gzip_flag is set to IGZIP_GZIP, a generic gzip header and the gzip + * trailer are written around the deflate compressed data. If gzip_flag is set + * to IGZIP_GZIP_NO_HDR, then only the gzip trailer is written. + * + * @param stream Structure holding state information on the compression streams. + * @return COMP_OK (if everything is ok), + * INVALID_FLUSH (if an invalid FLUSH is selected), + * ISAL_INVALID_LEVEL (if an invalid compression level is selected), + * ISAL_INVALID_LEVEL_BUF (if the level buffer is not large enough), + * STATELESS_OVERFLOW (if output buffer will not fit output). + */ +int isal_deflate_stateless(struct isal_zstream *stream); + + +/******************************************************************************/ +/* Inflate functions */ +/******************************************************************************/ +/** + * @brief Initialize decompression state data structure + * + * @param state Structure holding state information on the compression streams. 
+ * @returns none + */ +void isal_inflate_init(struct inflate_state *state); + +/** + * @brief Reinitialize decompression state data structure + * + * @param state Structure holding state information on the compression streams. + * @returns none + */ +void isal_inflate_reset(struct inflate_state *state); + +/** + * @brief Set decompression dictionary to use + * + * This function is to be called after isal_inflate_init. If the dictionary is + * longer than IGZIP_HIST_SIZE, only the last IGZIP_HIST_SIZE bytes will be + * used. + * + * @param state: Structure holding state information on the decompression stream. + * @param dict: Array containing dictionary to use. + * @param dict_len: Length of dict. + * @returns COMP_OK, + * ISAL_INVALID_STATE (dictionary could not be set) + */ +int isal_inflate_set_dict(struct inflate_state *state, uint8_t *dict, uint32_t dict_len); + +/** + * @brief Read and return gzip header information + * + * On entry state must be initialized and next_in pointing to a gzip compressed + * buffer. The buffers gz_hdr->extra, gz_hdr->name, gz_hdr->comments and the + * buffer lengths must be set to record the corresponding field, or set to NULL + * to disregard that gzip header information. If one of these buffers overflows, + * the user can reallocate a larger buffer and call this function again to + * continue reading the header information. + * + * @param state: Structure holding state information on the decompression stream. 
+ * @param gz_hdr: Structure to return data encoded in the gzip header + * @returns ISAL_DECOMP_OK (header was successfully parsed) + * ISAL_END_INPUT (all input was parsed), + * ISAL_NAME_OVERFLOW (gz_hdr->name overflowed while parsing), + * ISAL_COMMENT_OVERFLOW (gz_hdr->comment overflowed while parsing), + * ISAL_EXTRA_OVERFLOW (gz_hdr->extra overflowed while parsing), + * ISAL_INVALID_WRAPPER (invalid gzip header found), + * ISAL_UNSUPPORTED_METHOD (deflate is not the compression method), + * ISAL_INCORRECT_CHECKSUM (gzip header checksum was incorrect) + */ +int isal_read_gzip_header (struct inflate_state *state, struct isal_gzip_header *gz_hdr); + +/** + * @brief Read and return zlib header information + * + * On entry state must be initialized and next_in pointing to a zlib compressed + * buffer. + * + * @param state: Structure holding state information on the decompression stream. + * @param zlib_hdr: Structure to return data encoded in the zlib header + * @returns ISAL_DECOMP_OK (header was successfully parsed), + * ISAL_END_INPUT (all input was parsed), + * ISAL_UNSUPPORTED_METHOD (deflate is not the compression method), + * ISAL_INCORRECT_CHECKSUM (zlib header checksum was incorrect) + */ +int isal_read_zlib_header (struct inflate_state *state, struct isal_zlib_header *zlib_hdr); + +/** + * @brief Fast data (deflate) decompression for storage applications. + * + * On entry to isal_inflate(), next_in points to an input buffer and avail_in + * indicates the length of that buffer. Similarly next_out points to an empty + * output buffer and avail_out indicates the size of that buffer. + * + * The field total_out starts at 0 and is updated by isal_inflate(). This + * reflects the total number of bytes written so far. + * + * The call to isal_inflate() will take data from the input buffer (updating + * next_in, avail_in and write a decompressed stream to the output buffer + * (updating next_out and avail_out). 
The function returns when the input buffer + * is empty, the output buffer is full, invalid data is found, or in the case of + * zlib formatted data if a dictionary is specified. The current state of the + * decompression on exit can be read from state->block_state. + * + * If the crc_flag is set to ISAL_GZIP_NO_HDR the gzip crc of the output is + * stored in state->crc. Alternatively, if the crc_flag is set to + * ISAL_ZLIB_NO_HDR the adler32 of the output is stored in state->crc (checksum + * may not be updated until decompression is complete). When the crc_flag is set + * to ISAL_GZIP_NO_HDR_VER or ISAL_ZLIB_NO_HDR_VER, the behavior is the same, + * except the checksum is verified against the checksum immediately following + * the deflate data. If the crc_flag is set to ISAL_GZIP or ISAL_ZLIB, the + * gzip/zlib header is parsed, state->crc is set to the appropriate checksum, + * and the checksum is verified. If the crc_flag is set to ISAL_DEFLATE + * (default), then the data is treated as a raw deflate block. + * + * The element state->hist_bits has values from 0 to 15, where values of 1 to 15 + * are the log base 2 size of the matching window and 0 is the default with + * maximum history size. + * + * If a dictionary is required, a call to isal_inflate_set_dict will set the + * dictionary. + * + * @param state Structure holding state information on the compression streams. + * @return ISAL_DECOMP_OK (if everything is ok), + * ISAL_INVALID_BLOCK, + * ISAL_NEED_DICT, + * ISAL_INVALID_SYMBOL, + * ISAL_INVALID_LOOKBACK, + * ISAL_INVALID_WRAPPER, + * ISAL_UNSUPPORTED_METHOD, + * ISAL_INCORRECT_CHECKSUM. + */ + +int isal_inflate(struct inflate_state *state); + +/** + * @brief Fast data (deflate) stateless decompression for storage applications. + * + * Stateless (one shot) decompression routine with a similar interface to + * isal_inflate() but operates on entire input buffer at one time.
Parameter + * avail_out must be large enough to fit the entire decompressed + * output. Dictionaries are not supported. + * + * @param state Structure holding state information on the compression streams. + * @return ISAL_DECOMP_OK (if everything is ok), + * ISAL_END_INPUT (if all input was decompressed), + * ISAL_NEED_DICT, + * ISAL_OUT_OVERFLOW (if output buffer ran out of space), + * ISAL_INVALID_BLOCK, + * ISAL_INVALID_SYMBOL, + * ISAL_INVALID_LOOKBACK, + * ISAL_INVALID_WRAPPER, + * ISAL_UNSUPPORTED_METHOD, + * ISAL_INCORRECT_CHECKSUM. + */ +int isal_inflate_stateless(struct inflate_state *state); + +/******************************************************************************/ +/* Other functions */ +/******************************************************************************/ +/** + * @brief Calculate Adler-32 checksum, runs appropriate version. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param init: initial Adler-32 value + * @param buf: buffer to calculate checksum on + * @param len: buffer length in bytes + * + * @returns 32-bit Adler-32 checksum + */ +uint32_t isal_adler32(uint32_t init, const unsigned char *buf, uint64_t len); + +#ifdef __cplusplus +} +#endif +#endif /* ifndef _IGZIP_H */ diff --git a/src/isa-l/include/mem_routines.h b/src/isa-l/include/mem_routines.h new file mode 100644 index 000000000..3d23522e9 --- /dev/null +++ b/src/isa-l/include/mem_routines.h @@ -0,0 +1,64 @@ +/********************************************************************** + Copyright(c) 2011-2018 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stddef.h> + +/** + * @file mem_routines.h + * @brief Interface to storage mem operations + * + * Defines the interface for vector versions of common memory functions. 
+ */ + + +#ifndef _MEM_ROUTINES_H_ +#define _MEM_ROUTINES_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Detect if a memory region is all zero + * + * Zero detect function with optimizations for large blocks > 128 bytes + * + * @param mem Pointer to memory region to test + * @param len Length of region in bytes + * @returns 0 - region is all zeros + * other - region has non zero bytes + */ +int isal_zero_detect(void *mem, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif // _MEM_ROUTINES_H_ + diff --git a/src/isa-l/include/multibinary.asm b/src/isa-l/include/multibinary.asm new file mode 100644 index 000000000..588352a2f --- /dev/null +++ b/src/isa-l/include/multibinary.asm @@ -0,0 +1,440 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _MULTIBINARY_ASM_ +%define _MULTIBINARY_ASM_ + +%ifidn __OUTPUT_FORMAT__, elf32 + %define mbin_def_ptr dd + %define mbin_ptr_sz dword + %define mbin_rdi edi + %define mbin_rsi esi + %define mbin_rax eax + %define mbin_rbx ebx + %define mbin_rcx ecx + %define mbin_rdx edx +%else + %define mbin_def_ptr dq + %define mbin_ptr_sz qword + %define mbin_rdi rdi + %define mbin_rsi rsi + %define mbin_rax rax + %define mbin_rbx rbx + %define mbin_rcx rcx + %define mbin_rdx rdx +%endif + +%ifndef AS_FEATURE_LEVEL +%define AS_FEATURE_LEVEL 4 +%endif + +;;;; +; multibinary macro: +; creates the visable entry point that uses HW optimized call pointer +; creates the init of the HW optimized call pointer +;;;; +%macro mbin_interface 1 + ;;;; + ; *_dispatched is defaulted to *_mbinit and replaced on first call. + ; Therefore, *_dispatch_init is only executed on first call. 
+ ;;;; + section .data + %1_dispatched: + mbin_def_ptr %1_mbinit + + section .text + mk_global %1, function + %1_mbinit: + endbranch + ;;; only called the first time to setup hardware match + call %1_dispatch_init + ;;; falls thru to execute the hw optimized code + %1: + endbranch + jmp mbin_ptr_sz [%1_dispatched] +%endmacro + +;;;;; +; mbin_dispatch_init parameters +; Use this function when SSE/00/01 is a minimum requirement +; 1-> function name +; 2-> SSE/00/01 optimized function used as base +; 3-> AVX or AVX/02 opt func +; 4-> AVX2 or AVX/04 opt func +;;;;; +%macro mbin_dispatch_init 4 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01 + + mov eax, 1 + cpuid + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func + jne _%1_init_done ; AVX is not available so end + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%2 WRT_OPT] + + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init2 parameters +; Cases where only base functions are available +; 1-> function name +; 2-> base function +;;;;; +%macro mbin_dispatch_init2 2 + section .text + %1_dispatch_init: + push mbin_rsi + lea mbin_rsi, [%2 WRT_OPT] ; Default + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init_clmul 3 parameters +; Use this case for CRC which needs both SSE4_1 and CLMUL +; 1-> function name +; 2-> base 
function
+; 3-> SSE4_1 and CLMUL optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX512/10 opt func (only considered when AS_FEATURE_LEVEL >= 10)
+;;;;;
+%macro mbin_dispatch_init_clmul 5
+	section .text
+	%1_dispatch_init:
+		push mbin_rsi
+		push mbin_rax
+		push mbin_rbx
+		push mbin_rcx
+		push mbin_rdx
+		push mbin_rdi
+		lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov eax, 1
+		cpuid
+		mov ebx, ecx ; save cpuid1.ecx
+		test ecx, FLAG_CPUID1_ECX_SSE4_1
+		jz _%1_init_done
+		test ecx, FLAG_CPUID1_ECX_CLMUL
+		jz _%1_init_done
+		lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+		;; Test for XMM_YMM support/AVX
+		test ecx, FLAG_CPUID1_ECX_OSXSAVE
+		je _%1_init_done
+		xor ecx, ecx
+		xgetbv ; xcr -> edx:eax
+		mov edi, eax ; save xgetbv.eax
+
+		and eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+		jne _%1_init_done
+		test ebx, FLAG_CPUID1_ECX_AVX
+		je _%1_init_done
+		lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+%if AS_FEATURE_LEVEL >= 10
+		;; Test for AVX2
+		;; (only a gate here: this macro has no AVX2-specific variant)
+		xor ecx, ecx
+		mov eax, 7
+		cpuid
+		test ebx, FLAG_CPUID7_EBX_AVX2
+		je _%1_init_done ; No AVX2 possible
+
+		;; Test for AVX512
+		and edi, FLAG_XGETBV_EAX_ZMM_OPM
+		cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+		jne _%1_init_done ; No AVX512 possible
+		and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+		cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+		jne _%1_init_done ; G1 set incomplete: keep AVX/02
+
+		and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+		cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+		lea mbin_rbx, [%5 WRT_OPT] ; AVX512/10 opt
+		cmove mbin_rsi, mbin_rbx
+%endif
+	_%1_init_done:
+		pop mbin_rdi
+		pop mbin_rdx
+		pop mbin_rcx
+		pop mbin_rbx
+		pop mbin_rax
+		mov [%1_dispatched], mbin_rsi
+		pop mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init5 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init5 5
+	section .text
+	%1_dispatch_init:
+		push mbin_rsi
+		push mbin_rax
+		push mbin_rbx
+		push mbin_rcx
+		push mbin_rdx
+		lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov eax, 1
+		cpuid
+		; Test for SSE4.2
+		test ecx, FLAG_CPUID1_ECX_SSE4_2
+		lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func
+		cmovne mbin_rsi, mbin_rbx
+
+		and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		; lea leaves the flags untouched, so the jne below still tests the cmp
+		lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
+		jne _%1_init_done ; AVX is not available so end
+		mov mbin_rsi, mbin_rbx
+
+		;; Try for AVX2
+		xor ecx, ecx
+		mov eax, 7
+		cpuid
+		test ebx, FLAG_CPUID7_EBX_AVX2
+		lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
+		cmovne mbin_rsi, mbin_rbx
+
+		;; Does it have xmm and ymm support
+		xor ecx, ecx
+		xgetbv
+		and eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+		je _%1_init_done
+		; XMM+YMM state not both enabled: drop back to the SSE (%3) version
+		lea mbin_rsi, [%3 WRT_OPT]
+
+	_%1_init_done:
+		pop mbin_rdx
+		pop mbin_rcx
+		pop mbin_rbx
+		pop mbin_rax
+		mov [%1_dispatched], mbin_rsi
+		pop mbin_rsi
+		ret
+%endmacro
+
+%if AS_FEATURE_LEVEL >= 6
+;;;;;
+; mbin_dispatch_init6 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+;;;;;
+%macro mbin_dispatch_init6 6
+	section .text
+	%1_dispatch_init:
+		push mbin_rsi
+		push mbin_rax
+		push mbin_rbx
+		push mbin_rcx
+		push mbin_rdx
+		push mbin_rdi
+		lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov eax, 1
+		cpuid
+		mov ebx, ecx ; save cpuid1.ecx
+		test ecx, FLAG_CPUID1_ECX_SSE4_2
+		je _%1_init_done ; Use base function if no SSE4_2
+		lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+		;; Test for XMM_YMM support/AVX
+		test ecx, FLAG_CPUID1_ECX_OSXSAVE
+		je _%1_init_done
+		xor ecx, ecx
+		xgetbv ; xcr -> edx:eax
+		mov edi, eax ; save xgetbv.eax
+
+		and eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+		jne _%1_init_done
+		test ebx, FLAG_CPUID1_ECX_AVX
+		je _%1_init_done
+		lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+		;; Test for AVX2
+		xor ecx, ecx
+		mov eax, 7
+		cpuid
+		test ebx, FLAG_CPUID7_EBX_AVX2
+		je _%1_init_done ; No AVX2 possible
+		lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+		;; Test for AVX512
+		and edi, FLAG_XGETBV_EAX_ZMM_OPM
+		cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+		jne _%1_init_done ; No AVX512 possible
+		and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+		cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+		lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+		cmove mbin_rsi, mbin_rbx
+
+	_%1_init_done:
+		pop mbin_rdi
+		pop mbin_rdx
+		pop mbin_rcx
+		pop mbin_rbx
+		pop mbin_rax
+		mov [%1_dispatched], mbin_rsi
+		pop mbin_rsi
+		ret
+%endmacro
+
+%else
+; AS_FEATURE_LEVEL < 6: the sixth argument is ignored and init5 is used.
+%macro mbin_dispatch_init6 6
+	mbin_dispatch_init5 %1, %2, %3, %4, %5
+%endmacro
+%endif
+
+%if AS_FEATURE_LEVEL >= 10
+;;;;;
+; mbin_dispatch_init7 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVX512 Update/10 opt func
+;;;;;
+%macro mbin_dispatch_init7 7
+	section .text
+	%1_dispatch_init:
+		push mbin_rsi
+		push mbin_rax
+		push mbin_rbx
+		push mbin_rcx
+		push mbin_rdx
+		push mbin_rdi
+		lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov eax, 1
+		cpuid
+		mov ebx, ecx ; save cpuid1.ecx
+		test ecx, FLAG_CPUID1_ECX_SSE4_2
+		je _%1_init_done ; Use base function if no SSE4_2
+		lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+		;; Test for XMM_YMM support/AVX
+		test ecx, FLAG_CPUID1_ECX_OSXSAVE
+		je _%1_init_done
+		xor ecx, ecx
+		xgetbv ; xcr -> edx:eax
+		mov edi, eax ; save xgetbv.eax
+
+		and eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+		jne _%1_init_done
+		test ebx, FLAG_CPUID1_ECX_AVX
+		je _%1_init_done
+		lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+		;; Test for AVX2
+		xor ecx, ecx
+		mov eax, 7
+		cpuid
+		test ebx, FLAG_CPUID7_EBX_AVX2
+		je _%1_init_done ; No AVX2 possible
+		lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+		;; Test for AVX512
+		and edi, FLAG_XGETBV_EAX_ZMM_OPM
+		cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+		jne _%1_init_done ; No AVX512 possible
+		and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+		cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+		lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+		cmove mbin_rsi, mbin_rbx
+
+		; NOTE(review): unlike mbin_dispatch_init_clmul, the G2 test below
+		; still runs when the G1 compare above failed, so %7 can be chosen
+		; from the CPUID.7 ECX bits alone - confirm this is intended.
+		and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+		cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+		lea mbin_rbx, [%7 WRT_OPT] ; AVX512/10 opt
+		cmove mbin_rsi, mbin_rbx
+
+	_%1_init_done:
+		pop mbin_rdi
+		pop mbin_rdx
+		pop mbin_rcx
+		pop mbin_rbx
+		pop mbin_rax
+		mov [%1_dispatched], mbin_rsi
+		pop mbin_rsi
+		ret
+%endmacro
+%else
+; AS_FEATURE_LEVEL < 10: the seventh argument is ignored and init6 is used.
+%macro mbin_dispatch_init7 7
+	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
+%endmacro
+%endif
+
+%endif ; ifndef _MULTIBINARY_ASM_
diff --git a/src/isa-l/include/raid.h b/src/isa-l/include/raid.h
new file mode 100644
index 000000000..6100a4824
--- /dev/null
+++ b/src/isa-l/include/raid.h
@@ -0,0 +1,305 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _RAID_H_ +#define _RAID_H_ + +/** + * @file raid.h + * @brief Interface to RAID functions - XOR and P+Q calculation. + * + * This file defines the interface to optimized XOR calculation (RAID5) or P+Q + * dual parity (RAID6). Operations are carried out on an array of pointers to + * sources and output arrays. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Multi-binary functions */ + +/** + * @brief Generate XOR parity vector from N sources, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. 
+ * + * @returns 0 pass, other fail + */ + +int xor_check(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 32B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int pq_gen(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + * are assumed to be the last two pointers in the array. + * All pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_check(int vects, int len, void **array); + + +/* Arch specific versions */ +// x86 only +#if defined(__i386__) || defined(__x86_64__) + +/** + * @brief Generate XOR parity vector from N sources. + * @requires SSE4.1 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 16B. 
+ * + * @returns 0 pass, other fail + */ + +int xor_gen_sse(int vects, int len, void **array); + + +/** + * @brief Generate XOR parity vector from N sources. + * @requires AVX + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_avx(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors. + * @requires SSE4.1 + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_check_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires SSE4.1 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires AVX + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 16B. 
+ * + * @returns 0 pass, other fail + */ + +int pq_gen_avx(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires AVX2 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 32B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_avx2(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors. + * @requires SSE4.1 + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + are assumed to be the last two pointers in the array. + All pointers must be aligned to 16B. + * @returns 0 pass, other fail + */ + +int pq_check_sse(int vects, int len, void **array); + +#endif + +/** + * @brief Generate P+Q parity vectors from N sources, runs baseline version. + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_base(int vects, int len, void **array); + + +/** + * @brief Generate XOR parity vector from N sources, runs baseline version. + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. 
For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_base(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors, runs baseline version. + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_check_base(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs baseline version. + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + * are assumed to be the last two pointers in the array. + * All pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_check_base(int vects, int len, void **array); + +#ifdef __cplusplus +} +#endif + +#endif //_RAID_H_ diff --git a/src/isa-l/include/reg_sizes.asm b/src/isa-l/include/reg_sizes.asm new file mode 100644 index 000000000..b7ad842d8 --- /dev/null +++ b/src/isa-l/include/reg_sizes.asm @@ -0,0 +1,291 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _REG_SIZES_ASM_ +%define _REG_SIZES_ASM_ + +%ifndef AS_FEATURE_LEVEL +%define AS_FEATURE_LEVEL 4 +%endif + +%define EFLAGS_HAS_CPUID (1<<21) +%define FLAG_CPUID1_ECX_CLMUL (1<<1) +%define FLAG_CPUID1_EDX_SSE2 (1<<26) +%define FLAG_CPUID1_ECX_SSE3 (1) +%define FLAG_CPUID1_ECX_SSE4_1 (1<<19) +%define FLAG_CPUID1_ECX_SSE4_2 (1<<20) +%define FLAG_CPUID1_ECX_POPCNT (1<<23) +%define FLAG_CPUID1_ECX_AESNI (1<<25) +%define FLAG_CPUID1_ECX_OSXSAVE (1<<27) +%define FLAG_CPUID1_ECX_AVX (1<<28) +%define FLAG_CPUID1_EBX_AVX2 (1<<5) + +%define FLAG_CPUID7_EBX_AVX2 (1<<5) +%define FLAG_CPUID7_EBX_AVX512F (1<<16) +%define FLAG_CPUID7_EBX_AVX512DQ (1<<17) +%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21) +%define FLAG_CPUID7_EBX_AVX512PF (1<<26) +%define FLAG_CPUID7_EBX_AVX512ER (1<<27) +%define FLAG_CPUID7_EBX_AVX512CD (1<<28) +%define FLAG_CPUID7_EBX_AVX512BW (1<<30) +%define FLAG_CPUID7_EBX_AVX512VL (1<<31) + +%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1) +%define FLAG_CPUID7_ECX_AVX512VBMI2 (1 << 6) +%define FLAG_CPUID7_ECX_GFNI (1 << 8) +%define FLAG_CPUID7_ECX_VAES (1 << 9) +%define FLAG_CPUID7_ECX_VPCLMULQDQ (1 << 10) +%define FLAG_CPUID7_ECX_VNNI (1 << 11) +%define FLAG_CPUID7_ECX_BITALG (1 << 12) +%define FLAG_CPUID7_ECX_VPOPCNTDQ (1 << 14) + +%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ) +%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ) + +%define FLAG_XGETBV_EAX_XMM (1<<1) +%define FLAG_XGETBV_EAX_YMM (1<<2) +%define FLAG_XGETBV_EAX_XMM_YMM 0x6 +%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0 + +%define FLAG_CPUID1_EAX_AVOTON 0x000406d0 +%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0 + +; define d and w variants for registers 
+ +%define raxd eax +%define raxw ax +%define raxb al + +%define rbxd ebx +%define rbxw bx +%define rbxb bl + +%define rcxd ecx +%define rcxw cx +%define rcxb cl + +%define rdxd edx +%define rdxw dx +%define rdxb dl + +%define rsid esi +%define rsiw si +%define rsib sil + +%define rdid edi +%define rdiw di +%define rdib dil + +%define rbpd ebp +%define rbpw bp +%define rbpb bpl + +%define ymm0x xmm0 +%define ymm1x xmm1 +%define ymm2x xmm2 +%define ymm3x xmm3 +%define ymm4x xmm4 +%define ymm5x xmm5 +%define ymm6x xmm6 +%define ymm7x xmm7 +%define ymm8x xmm8 +%define ymm9x xmm9 +%define ymm10x xmm10 +%define ymm11x xmm11 +%define ymm12x xmm12 +%define ymm13x xmm13 +%define ymm14x xmm14 +%define ymm15x xmm15 + +%define zmm0x xmm0 +%define zmm1x xmm1 +%define zmm2x xmm2 +%define zmm3x xmm3 +%define zmm4x xmm4 +%define zmm5x xmm5 +%define zmm6x xmm6 +%define zmm7x xmm7 +%define zmm8x xmm8 +%define zmm9x xmm9 +%define zmm10x xmm10 +%define zmm11x xmm11 +%define zmm12x xmm12 +%define zmm13x xmm13 +%define zmm14x xmm14 +%define zmm15x xmm15 +%define zmm16x xmm16 +%define zmm17x xmm17 +%define zmm18x xmm18 +%define zmm19x xmm19 +%define zmm20x xmm20 +%define zmm21x xmm21 +%define zmm22x xmm22 +%define zmm23x xmm23 +%define zmm24x xmm24 +%define zmm25x xmm25 +%define zmm26x xmm26 +%define zmm27x xmm27 +%define zmm28x xmm28 +%define zmm29x xmm29 +%define zmm30x xmm30 +%define zmm31x xmm31 + +%define zmm0y ymm0 +%define zmm1y ymm1 +%define zmm2y ymm2 +%define zmm3y ymm3 +%define zmm4y ymm4 +%define zmm5y ymm5 +%define zmm6y ymm6 +%define zmm7y ymm7 +%define zmm8y ymm8 +%define zmm9y ymm9 +%define zmm10y ymm10 +%define zmm11y ymm11 +%define zmm12y ymm12 +%define zmm13y ymm13 +%define zmm14y ymm14 +%define zmm15y ymm15 +%define zmm16y ymm16 +%define zmm17y ymm17 +%define zmm18y ymm18 +%define zmm19y ymm19 +%define zmm20y ymm20 +%define zmm21y ymm21 +%define zmm22y ymm22 +%define zmm23y ymm23 +%define zmm24y ymm24 +%define zmm25y ymm25 +%define zmm26y ymm26 +%define zmm27y ymm27 
+%define zmm28y ymm28
+%define zmm29y ymm29
+%define zmm30y ymm30
+%define zmm31y ymm31
+
+; Token-pasting helpers: append a size suffix to a register name so that it
+; resolves through the *d/*w/*b/*x alias defines in this file.
+%define DWORD(reg) reg %+ d
+%define WORD(reg) reg %+ w
+%define BYTE(reg) reg %+ b
+
+%define XWORD(reg) reg %+ x
+
+; ELF outputs get a .note.GNU-stack section; the 64-bit output formats also
+; define __x86_64__ for the checks below.
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+ %define __x86_64__
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,win64
+ %define __x86_64__
+%endif
+%ifidn __OUTPUT_FORMAT__,macho64
+ %define __x86_64__
+%endif
+
+; endbranch is emitted as raw bytes: f3 0f 1e fa encodes ENDBR64 and
+; f3 0f 1e fb encodes ENDBR32.
+%ifdef __x86_64__
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa
+%else
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb
+%endif
+
+; WRT_OPT is appended to symbol references in the dispatch macros; it expands
+; to "wrt ..plt" only for elf64 output when REL_TEXT is not defined.
+%ifdef REL_TEXT
+ %define WRT_OPT
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define WRT_OPT wrt ..plt
+%else
+ %define WRT_OPT
+%endif
+
+; mk_global: export a symbol.
+; 1-> symbol name
+; 2-> (optional) symbol type, emitted after ':'
+; 3-> (optional) extra qualifier emitted after the type
+; Under NASM with macho64 or win64 output only "global %1" is emitted.
+%macro mk_global 1-3
+ %ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, macho64
+	global %1
+ %elifidn __OUTPUT_FORMAT__, win64
+	global %1
+ %else
+	global %1:%2 %3
+ %endif
+ %else
+	global %1:%2 %3
+ %endif
+%endmacro
+
+
+; Fixes for nasm lack of MS proc helpers
+%ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, win64
+ %macro alloc_stack 1
+	sub rsp, %1
+ %endmacro
+
+ %macro proc_frame 1
+	%1:
+ %endmacro
+
+ %macro save_xmm128 2
+	movdqa [rsp + %2], %1
+ %endmacro
+
+ %macro save_reg 2
+	mov [rsp + %2], %1
+ %endmacro
+
+ %macro rex_push_reg 1
+	push %1
+ %endmacro
+
+ %macro push_reg 1
+	push %1
+ %endmacro
+
+ %define end_prolog
+ %endif
+
+ %define endproc_frame
+%endif
+
+; On macho64 the token elf64 is redefined to macho64 - presumably so later
+; elf64 format checks also match Mach-O output; verify against users.
+%ifidn __OUTPUT_FORMAT__, macho64
+ %define elf64 macho64
+ mac_equ equ 1
+%endif
+
+; slversion: emit a version record in .text under two global labels.
+; 1-> symbol prefix
+; 2-> hex digits emitted as the second byte of the "db" pair
+; 3-> hex digits emitted as the first byte of the "db" pair
+; 4-> hex digits emitted as the leading 16-bit word
+%macro slversion 4
+	section .text
+	global %1_slver_%2%3%4
+	global %1_slver
+	%1_slver:
+	%1_slver_%2%3%4:
+		dw 0x%4
+		db 0x%3, 0x%2
+%endmacro
+
+%endif ; ifndef _REG_SIZES_ASM_
diff --git a/src/isa-l/include/test.h b/src/isa-l/include/test.h
new file mode 100644
index 000000000..31ccc67b9
--- /dev/null
+++ b/src/isa-l/include/test.h
@@ -0,0 +1,285 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _TEST_H
+#define _TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+/* Decide whether to use benchmark time as an approximation or a minimum. Fewer
+ * calls to the timer are required for the approximation case.*/
+#define BENCHMARK_MIN_TIME 0
+#define BENCHMARK_APPROX_TIME 1
+#ifndef BENCHMARK_TYPE
+#define BENCHMARK_TYPE BENCHMARK_MIN_TIME
+#endif
+
+#ifdef USE_RDTSC
+/* The use of rdtsc is nuanced. On many processors it corresponds to a
+ * standardized clock source. To obtain a meaningful result it may be
+ * necessary to fix the CPU clock to match the rdtsc tick rate.
+ */
+# include <inttypes.h>
+# include <x86intrin.h>
+# define USE_CYCLES
+#else
+# include <time.h>
+#define USE_SECONDS
+#endif
+
+#ifdef USE_RDTSC
+#ifndef BENCHMARK_TIME
+# define BENCHMARK_TIME 6
+#endif
+# define GHZ 1000000000
+# define UNIT_SCALE (GHZ)
+# define CALLIBRATE_TIME (UNIT_SCALE / 2)
+/* Current timestamp counter value, in ticks. */
+static inline long long get_time(void) {
+	unsigned int dummy;
+	return __rdtscp(&dummy);
+}
+
+/* Timer resolution in get_time() units: one tick. */
+static inline long long get_res(void) {
+	return 1;
+}
+#else
+#ifndef BENCHMARK_TIME
+# define BENCHMARK_TIME 3
+#endif
+#ifdef _MSC_VER
+/* NOTE(review): QueryPerformanceCounter/QueryPerformanceFrequency are not
+ * declared by any header included here and are passed a long long * -
+ * confirm that users include <windows.h> and that the pointer type is
+ * accepted by their compiler. */
+#define UNIT_SCALE get_res()
+#define CALLIBRATE_TIME (UNIT_SCALE / 4)
+static inline long long get_time(void) {
+	long long ret = 0;
+	QueryPerformanceCounter(&ret);
+	return ret;
+}
+
+static inline long long get_res(void) {
+	long long ret = 0;
+	QueryPerformanceFrequency(&ret);
+	return ret;
+}
+#else
+# define NANO_SCALE 1000000000
+# define UNIT_SCALE NANO_SCALE
+# define CALLIBRATE_TIME (UNIT_SCALE / 4)
+#ifdef __FreeBSD__
+# define CLOCK_ID CLOCK_MONOTONIC_PRECISE
+#else
+# define CLOCK_ID CLOCK_MONOTONIC
+#endif
+
+/* Current CLOCK_ID time in nanoseconds. */
+static inline long long get_time(void) {
+	struct timespec time;
+	long long nano_total;
+	clock_gettime(CLOCK_ID, &time);
+	nano_total = time.tv_sec;
+	nano_total *= NANO_SCALE;
+	nano_total += time.tv_nsec;
+	return nano_total;
+}
+
+/* Resolution of CLOCK_ID in nanoseconds. */
+static inline long long get_res(void) {
+	struct timespec time;
+	long long nano_total;
+	clock_getres(CLOCK_ID, &time);
+	nano_total = time.tv_sec;
+	nano_total *= NANO_SCALE;
+	nano_total += time.tv_nsec;
+	return nano_total;
+}
+#endif
+#endif
+/* Timing state; start/stop/run_total are in get_time() units. */
+struct perf {
+	long long start;
+	long long stop;
+	long long run_total;
+	long long iterations;
+};
+
+/* Reset the timing fields. iterations is deliberately left as it is. */
+static inline void perf_init(struct perf *p) {
+	p->start = 0;
+	p->stop = 0;
+	p->run_total = 0;
+}
+
+/* Begin (or resume) a timed interval. */
+static inline void perf_continue(struct perf *p) {
+	p->start = get_time();
+}
+
+/* End the current interval and add its length to run_total. */
+static inline void perf_pause(struct perf *p) {
+	p->stop = get_time();
+	p->run_total = p->run_total + p->stop - p->start;
+	p->start = p->stop;
+}
+
+/* Reset the timing fields and begin a timed interval. */
+static inline void perf_start(struct perf *p) {
+	perf_init(p);
+	perf_continue(p);
+}
+
+static inline void perf_stop(struct perf *p) {
+	perf_pause(p);
+}
+
+/* Accumulated run time divided by UNIT_SCALE. */
+static inline double get_time_elapsed(struct perf *p) {
+	return 1.0 * p->run_total / UNIT_SCALE;
+}
+
+/* Accumulated run time in get_time() units. */
+static inline long long get_base_elapsed(struct perf *p) {
+	return p->run_total;
+}
+
+/* Scale an iteration count to a target duration: returns
+ * ceil(total * runs / elapsed), where elapsed is the accumulated run time,
+ * or the timer resolution when no time has been accumulated yet. */
+static inline unsigned long long estimate_perf_iterations(struct perf *p,
+							   unsigned long long runs,
+							   unsigned long long total) {
+	total = total * runs;
+	if (get_base_elapsed(p) > 0)
+		return (total + get_base_elapsed(p) - 1) / get_base_elapsed(p);
+	else
+		return (total + get_res() - 1) / get_res();
+}
+
+/* Repeat FUNC_CALL with a growing iteration count until one timed batch lasts
+ * at least CALLIBRATE_TIME; the final count is stored in (PERF)->iterations. */
+#define CALLIBRATE(PERF, FUNC_CALL) {					\
+	unsigned long long _i, _iter = 1;				\
+	perf_start(PERF);						\
+	FUNC_CALL;							\
+	perf_pause(PERF);						\
+									\
+	while (get_base_elapsed(PERF) < CALLIBRATE_TIME) {		\
+		_iter = estimate_perf_iterations(PERF, _iter,		\
+						 2 * CALLIBRATE_TIME);	\
+		perf_start(PERF);					\
+		for (_i = 0; _i < _iter; _i++) {			\
+			FUNC_CALL;					\
+		}							\
+		perf_stop(PERF);					\
+	}								\
+	(PERF)->iterations=_iter;					\
+}
+
+/* Time FUNC_CALL for roughly RUN_TIME * UNIT_SCALE time units, using the
+ * iteration count and elapsed time already stored in PERF as the estimate.
+ * With BENCHMARK_MIN_TIME a second batch runs if the first fell short. */
+#define PERFORMANCE_TEST(PERF, RUN_TIME, FUNC_CALL) {			\
+	unsigned long long _i, _iter = (PERF)->iterations;		\
+	unsigned long long _run_total = RUN_TIME;			\
+	_run_total *= UNIT_SCALE;					\
+	_iter = estimate_perf_iterations(PERF, _iter, _run_total);\
+	(PERF)->iterations = 0;						\
+	perf_start(PERF);						\
+	for (_i = 0; _i < _iter; _i++) {				\
+		FUNC_CALL;						\
+	}								\
+	perf_pause(PERF);						\
+	(PERF)->iterations += _iter;					\
+									\
+	if(get_base_elapsed(PERF) < _run_total &&			\
+	   BENCHMARK_TYPE == BENCHMARK_MIN_TIME) {			\
+		_iter = estimate_perf_iterations(PERF, _iter,		\
+			_run_total - get_base_elapsed(PERF) +		\
+			(UNIT_SCALE / 16));				\
+		perf_continue(PERF);					\
+		for (_i = 0; _i < _iter; _i++) {			\
+			FUNC_CALL;					\
+		}							\
+		perf_pause(PERF);					\
+		(PERF)->iterations += _iter;				\
+	}								\
+}
+
+/* Calibrate then measure FUNC_CALL; when RUN_TIME is not positive, time a
+ * single call instead. */
+#define BENCHMARK(PERF, RUN_TIME, FUNC_CALL) {				\
+	if((RUN_TIME) > 0) {						\
+		CALLIBRATE(PERF, FUNC_CALL);				\
+		PERFORMANCE_TEST(PERF, RUN_TIME, FUNC_CALL);		\
+									\
+	} else {							\
+		(PERF)->iterations = 1;					\
+		perf_start(PERF);					\
+		FUNC_CALL;						\
+		perf_stop(PERF);					\
+	}								\
+}
+
+#ifdef USE_CYCLES
+/* Print elapsed ticks and ticks per byte; unit_count is the number of units
+ * processed per iteration. */
+static inline void perf_print(struct perf p, long long unit_count) {
+	long long total_units = p.iterations * unit_count;
+
+	printf("runtime = %10lld ticks", get_base_elapsed(&p));
+	if (total_units != 0) {
+		printf(", bandwidth %lld MB in %.4f GC = %.2f ticks/byte",
+		       total_units / (1000000), get_time_elapsed(&p),
+		       get_base_elapsed(&p) / (double)total_units);
+	}
+	printf("\n");
+}
+#else
+/* Print elapsed microseconds and MB/s; unit_count is the number of units
+ * processed per iteration. */
+static inline void perf_print(struct perf p, double unit_count) {
+	long long total_units = p.iterations * unit_count;
+	long long usecs = (long long)(get_time_elapsed(&p) * 1000000);
+
+	printf("runtime = %10lld usecs", usecs);
+	if (total_units != 0) {
+		printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s",
+		       total_units / (1000000), get_time_elapsed(&p),
+		       ((double)total_units) / (1000000 * get_time_elapsed(&p)));
+	}
+	printf("\n");
+}
+#endif
+
+/**
+ * @brief Return the size in bytes of the file behind an open stream.
+ *
+ * Seeks to the end of the stream to read the end offset, then restores the
+ * position the caller had. The offset is obtained with ftello()/_ftelli64()
+ * instead of reinterpreting an fpos_t, whose representation is unspecified.
+ *
+ * @param fp  Open, seekable stream.
+ * @returns File size in bytes, or 0 if the current position cannot be saved
+ *          or the end offset cannot be determined.
+ */
+static inline uint64_t get_filesize(FILE * fp) {
+	uint64_t file_size = 0;
+	long long end_off = -1;
+	fpos_t pos_curr;
+
+	if (fgetpos(fp, &pos_curr) != 0)	/* Save current position */
+		return 0;
+#if defined(_WIN32) || defined(_WIN64)
+	if (_fseeki64(fp, 0, SEEK_END) == 0)
+		end_off = _ftelli64(fp);
+#else
+	if (fseeko(fp, 0, SEEK_END) == 0)
+		end_off = ftello(fp);
+#endif
+	if (end_off > 0)
+		file_size = (uint64_t) end_off;
+	fsetpos(fp, &pos_curr);	/* Restore position */
+
+	return file_size;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _TEST_H
diff --git a/src/isa-l/include/types.h b/src/isa-l/include/types.h
new file mode 100644
index 000000000..531c79724
---
/dev/null +++ b/src/isa-l/include/types.h @@ -0,0 +1,77 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file types.h + * @brief Defines standard width types. 
+ * + */ + +#ifndef __TYPES_H +#define __TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +#ifdef __MINGW32__ +# include <_mingw.h> +#endif +#endif + + +/* Portability shims. DECLARE_ALIGNED(decl, alignval) attaches an alignment request to a declaration (GCC-style attribute on unix, Apple and MinGW; __declspec(align) otherwise). aligned_free(x) maps to free on unix/Apple and to _aligned_free elsewhere. Outside unix/Apple, posix_memalign is emulated with _aligned_malloc: the macro stores the result through p and evaluates to 0 when the allocation succeeded and 1 when it returned NULL. The unix/Apple branch also defines __forceinline as static inline. */ #if defined __unix__ || defined __APPLE__ +# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval))) +# define __forceinline static inline +# define aligned_free(x) free(x) +#else +# ifdef __MINGW32__ +# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval))) +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# else +# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# endif +#endif + +/* DEBUG_PRINT(x) expands to printf x when DEBUG is defined, so x has to be a complete parenthesized printf argument list; without DEBUG it expands to an empty do-while statement and x is not evaluated. */ #ifdef DEBUG +# define DEBUG_PRINT(x) printf x +#else +# define DEBUG_PRINT(x) do {} while (0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif //__TYPES_H diff --git a/src/isa-l/include/unaligned.h b/src/isa-l/include/unaligned.h new file mode 100644 index 000000000..f7b1ed88e --- /dev/null +++ b/src/isa-l/include/unaligned.h @@ -0,0 +1,76 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef UNALIGNED_H +#define UNALIGNED_H + +#include "stdint.h" +#include "string.h" + +/* load_u16, load_u32, load_u64, load_umax: copy sizeof(return type) bytes from buf into a local of that type with memcpy and return it. buf is only ever handed to memcpy, so these helpers never dereference it as a wider type, and the bytes are taken in memory order with no byte swapping. NOTE(review): none of the load helpers write through buf, so the parameter could be const-qualified. */ static inline uint16_t load_u16(uint8_t * buf) { + uint16_t ret; + memcpy(&ret, buf, sizeof(ret)); + return ret; +} + +static inline uint32_t load_u32(uint8_t * buf) { + uint32_t ret; + memcpy(&ret, buf, sizeof(ret)); + return ret; +} + +static inline uint64_t load_u64(uint8_t * buf) { + uint64_t ret; + memcpy(&ret, buf, sizeof(ret)); + return ret; +} + +/* Same as the fixed-width loads, but reads sizeof(uintmax_t) bytes. */ static inline uintmax_t load_umax(uint8_t * buf) { + uintmax_t ret; + memcpy(&ret, buf, sizeof(ret)); + return ret; +} + +/* store_u16, store_u32, store_u64, store_umax: copy the sizeof(val) bytes of val to buf with memcpy, in memory order with no byte swapping; buf must have room for that many bytes. */ static inline void store_u16(uint8_t * buf, uint16_t val) { + memcpy(buf, &val, sizeof(val)); +} + +static inline void store_u32(uint8_t * buf, uint32_t val) { + memcpy(buf, &val, sizeof(val)); +} + +static inline void store_u64(uint8_t * buf, uint64_t val) { + memcpy(buf, &val, sizeof(val)); +} + +/* Same as the fixed-width stores, but writes sizeof(uintmax_t) bytes. */ static inline void store_umax(uint8_t * buf, uintmax_t val) { + memcpy(buf, &val, 
sizeof(val)); +} + +#endif |