author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-06-15 09:41:35 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-06-15 09:41:35 +0000
commit     2ed1dcfa30b3967f7d6df74fba78ce23ed065497
tree       8ff5a74b07bf976cd88df2460e1c9cafb27f050a  /src/liblzma/check
parent     Releasing progress-linux version 5.6.1+really5.4.5-1~progress7.99u1.
Merging upstream version 5.6.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/liblzma/check')
-rw-r--r--  src/liblzma/check/Makefile.inc        15
-rw-r--r--  src/liblzma/check/check.c              5
-rw-r--r--  src/liblzma/check/check.h              5
-rw-r--r--  src/liblzma/check/crc32_arm64.h      122
-rw-r--r--  src/liblzma/check/crc32_fast.c       158
-rw-r--r--  src/liblzma/check/crc32_small.c        5
-rw-r--r--  src/liblzma/check/crc32_table.c       34
-rw-r--r--  src/liblzma/check/crc32_table_be.h     4
-rw-r--r--  src/liblzma/check/crc32_table_le.h     4
-rw-r--r--  src/liblzma/check/crc32_tablegen.c    23
-rw-r--r--  src/liblzma/check/crc32_x86.S          5
-rw-r--r--  src/liblzma/check/crc64_fast.c       446
-rw-r--r--  src/liblzma/check/crc64_small.c        5
-rw-r--r--  src/liblzma/check/crc64_table.c       16
-rw-r--r--  src/liblzma/check/crc64_table_be.h     4
-rw-r--r--  src/liblzma/check/crc64_table_le.h     4
-rw-r--r--  src/liblzma/check/crc64_tablegen.c    15
-rw-r--r--  src/liblzma/check/crc64_x86.S          5
-rw-r--r--  src/liblzma/check/crc_common.h       137
-rw-r--r--  src/liblzma/check/crc_macros.h        30
-rw-r--r--  src/liblzma/check/crc_x86_clmul.h    428
-rw-r--r--  src/liblzma/check/sha256.c            19
22 files changed, 954 insertions, 535 deletions
diff --git a/src/liblzma/check/Makefile.inc b/src/liblzma/check/Makefile.inc index dc011a3..04cf1ff 100644 --- a/src/liblzma/check/Makefile.inc +++ b/src/liblzma/check/Makefile.inc @@ -1,9 +1,8 @@ -## +## SPDX-License-Identifier: 0BSD ## Author: Lasse Collin -## -## This file has been put into the public domain. -## You can do whatever you want with this file. -## + +## Note: There is no check for COND_CHECK_CRC32 because +## currently crc32 is always enabled. EXTRA_DIST += \ check/crc32_tablegen.c \ @@ -12,9 +11,10 @@ EXTRA_DIST += \ liblzma_la_SOURCES += \ check/check.c \ check/check.h \ - check/crc_macros.h + check/crc_common.h \ + check/crc_x86_clmul.h \ + check/crc32_arm64.h -if COND_CHECK_CRC32 if COND_SMALL liblzma_la_SOURCES += check/crc32_small.c else @@ -28,7 +28,6 @@ else liblzma_la_SOURCES += check/crc32_fast.c endif endif -endif if COND_CHECK_CRC64 if COND_SMALL diff --git a/src/liblzma/check/check.c b/src/liblzma/check/check.c index 428ddae..7734ace 100644 --- a/src/liblzma/check/check.c +++ b/src/liblzma/check/check.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file check.c @@ -5,9 +7,6 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include "check.h" diff --git a/src/liblzma/check/check.h b/src/liblzma/check/check.h index 8ae95d5..f0eb117 100644 --- a/src/liblzma/check/check.h +++ b/src/liblzma/check/check.h @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file check.h @@ -5,9 +7,6 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #ifndef LZMA_CHECK_H diff --git a/src/liblzma/check/crc32_arm64.h b/src/liblzma/check/crc32_arm64.h new file mode 100644 index 0000000..39c1c63 --- /dev/null +++ b/src/liblzma/check/crc32_arm64.h @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file crc32_arm64.h +/// \brief CRC32 calculation with ARM64 optimization +// +// Authors: Chenxi Mao +// Jia Tan +// Hans Jansen +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef LZMA_CRC32_ARM64_H +#define LZMA_CRC32_ARM64_H + +// MSVC always has the CRC intrinsics available when building for ARM64 +// there is no need to include any header files. +#ifndef _MSC_VER +# include <arm_acle.h> +#endif + +// If both versions are going to be built, we need runtime detection +// to check if the instructions are supported. +#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) +# if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) +# include <sys/auxv.h> +# elif defined(_WIN32) +# include <processthreadsapi.h> +# elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) +# include <sys/sysctl.h> +# endif +#endif + +// Some EDG-based compilers support ARM64 and define __GNUC__ +// (such as Nvidia's nvcc), but do not support function attributes. +// +// NOTE: Build systems check for this too, keep them in sync with this. 
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) +# define crc_attr_target __attribute__((__target__("+crc"))) +#else +# define crc_attr_target +#endif + + +crc_attr_target +static uint32_t +crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + + // Align the input buffer because this was shown to be + // significantly faster than unaligned accesses. + const size_t align_amount = my_min(size, (0U - (uintptr_t)buf) & 7); + + for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf) + crc = __crc32b(crc, *buf); + + size -= align_amount; + + // Process 8 bytes at a time. The end point is determined by + // ignoring the least significant three bits of size to ensure + // we do not process past the bounds of the buffer. This guarantees + // that limit is a multiple of 8 and is strictly less than size. + for (const uint8_t *limit = buf + (size & ~(size_t)7); + buf < limit; buf += 8) + crc = __crc32d(crc, aligned_read64le(buf)); + + // Process the remaining bytes that are not 8 byte aligned. + for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf) + crc = __crc32b(crc, *buf); + + return ~crc; +} + + +#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) +static inline bool +is_arch_extension_supported(void) +{ +#if defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; + +#elif defined(HAVE_ELF_AUX_INFO) + unsigned long feature_flags; + + if (elf_aux_info(AT_HWCAP, &feature_flags, sizeof(feature_flags)) != 0) + return false; + + return (feature_flags & HWCAP_CRC32) != 0; + +#elif defined(_WIN32) + return IsProcessorFeaturePresent( + PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); + +#elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) + int has_crc32 = 0; + size_t size = sizeof(has_crc32); + + // The sysctlbyname() function requires a string identifier for the + // CPU feature it tests. The Apple documentation lists the string + // "hw.optional.armv8_crc32", which can be found here: + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619 + if (sysctlbyname("hw.optional.armv8_crc32", &has_crc32, + &size, NULL, 0) != 0) + return false; + + return has_crc32; + +#else + // If a runtime detection method cannot be found, then this must + // be a compile time error. The checks in crc_common.h should ensure + // a runtime detection method is always found if this function is + // built. It would be possible to just return false here, but this + // is inefficient for binary size and runtime since only the generic + // method could ever be used. +# error Runtime detection method unavailable. +#endif +} +#endif + +#endif // LZMA_CRC32_ARM64_H diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c index eed7350..16dbb74 100644 --- a/src/liblzma/check/crc32_fast.c +++ b/src/liblzma/check/crc32_fast.c @@ -1,35 +1,40 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc32.c /// \brief CRC32 calculation -/// -/// Calculate the CRC32 using the slice-by-eight algorithm. -/// It is explained in this document: -/// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf -/// The code in this file is not the same as in Intel's paper, but -/// the basic principle is identical. -// -// Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. 
+// Authors: Lasse Collin +// Ilya Kurdyukov +// Hans Jansen // /////////////////////////////////////////////////////////////////////////////// #include "check.h" -#include "crc_macros.h" +#include "crc_common.h" + +#if defined(CRC_X86_CLMUL) +# define BUILDING_CRC32_CLMUL +# include "crc_x86_clmul.h" +#elif defined(CRC32_ARM64) +# include "crc32_arm64.h" +#endif -// If you make any changes, do some benchmarking! Seemingly unrelated -// changes can very easily ruin the performance (and very probably is -// very compiler dependent). -extern LZMA_API(uint32_t) -lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) +#ifdef CRC32_GENERIC + +/////////////////// +// Generic CRC32 // +/////////////////// + +static uint32_t +crc32_generic(const uint8_t *buf, size_t size, uint32_t crc) { crc = ~crc; #ifdef WORDS_BIGENDIAN - crc = bswap32(crc); + crc = byteswap32(crc); #endif if (size > 8) { @@ -75,8 +80,125 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc); #ifdef WORDS_BIGENDIAN - crc = bswap32(crc); + crc = byteswap32(crc); #endif return ~crc; } +#endif + + +#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) + +////////////////////////// +// Function dispatching // +////////////////////////// + +// If both the generic and arch-optimized implementations are built, then +// the function to use is selected at runtime because the system running +// the binary might not have the arch-specific instruction set extension(s) +// available. The dispatch methods in order of priority: +// +// 1. Constructor. This method uses __attribute__((__constructor__)) to +// set crc32_func at load time. This avoids extra computation (and any +// unlikely threading bugs) on the first call to lzma_crc32() to decide +// which implementation should be used. +// +// 2. First Call Resolution. On the very first call to lzma_crc32(), the +// call will be directed to crc32_dispatch() instead. This will set the +// appropriate implementation function and will not be called again. +// This method does not use any kind of locking but is safe because if +// multiple threads run the dispatcher simultaneously then they will all +// set crc32_func to the same value. + +typedef uint32_t (*crc32_func_type)( + const uint8_t *buf, size_t size, uint32_t crc); + +// This resolver is shared between all dispatch methods. +static crc32_func_type +crc32_resolve(void) +{ + return is_arch_extension_supported() + ? &crc32_arch_optimized : &crc32_generic; +} + + +#ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR +// Constructor method. +# define CRC32_SET_FUNC_ATTR __attribute__((__constructor__)) +static crc32_func_type crc32_func; +#else +// First Call Resolution method. +# define CRC32_SET_FUNC_ATTR +static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc); +static crc32_func_type crc32_func = &crc32_dispatch; +#endif + +CRC32_SET_FUNC_ATTR +static void +crc32_set_func(void) +{ + crc32_func = crc32_resolve(); + return; +} + +#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR +static uint32_t +crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc) +{ + // When __attribute__((__constructor__)) isn't supported, set the + // function pointer without any locking. If multiple threads run + // the detection code in parallel, they will all end up setting + // the pointer to the same value. This avoids the use of + // mythread_once() on every call to lzma_crc32() but this likely + // isn't strictly standards compliant. Let's change it if it breaks. 
+ crc32_set_func(); + return crc32_func(buf, size, crc); +} + +#endif +#endif + + +extern LZMA_API(uint32_t) +lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) +{ +#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) + // On x86-64, if CLMUL is available, it is the best for non-tiny + // inputs, being over twice as fast as the generic slice-by-four + // version. However, for size <= 16 it's different. In the extreme + // case of size == 1 the generic version can be five times faster. + // At size >= 8 the CLMUL starts to become reasonable. It + // varies depending on the alignment of buf too. + // + // The above doesn't include the overhead of mythread_once(). + // At least on x86-64 GNU/Linux, pthread_once() is very fast but + // it still makes lzma_crc32(buf, 1, crc) 50-100 % slower. When + // size reaches 12-16 bytes the overhead becomes negligible. + // + // So using the generic version for size <= 16 may give better + // performance with tiny inputs but if such inputs happen rarely + // it's not so obvious because then the lookup table of the + // generic version may not be in the processor cache. +#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS + if (size <= 16) + return crc32_generic(buf, size, crc); +#endif + +/* +#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR + // See crc32_dispatch(). This would be the alternative which uses + // locking and doesn't use crc32_dispatch(). Note that on Windows + // this method needs Vista threads. + mythread_once(crc64_set_func); +#endif +*/ + return crc32_func(buf, size, crc); + +#elif defined(CRC32_ARCH_OPTIMIZED) + return crc32_arch_optimized(buf, size, crc); + +#else + return crc32_generic(buf, size, crc); +#endif +} diff --git a/src/liblzma/check/crc32_small.c b/src/liblzma/check/crc32_small.c index 186966e..6a1bd66 100644 --- a/src/liblzma/check/crc32_small.c +++ b/src/liblzma/check/crc32_small.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc32_small.c @@ -5,9 +7,6 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include "check.h" diff --git a/src/liblzma/check/crc32_table.c b/src/liblzma/check/crc32_table.c index b11762a..c141cef 100644 --- a/src/liblzma/check/crc32_table.c +++ b/src/liblzma/check/crc32_table.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc32_table.c @@ -5,18 +7,36 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include "common.h" + +// FIXME: Compared to crc_common.h this has to check for __x86_64__ too +// so that in 32-bit builds crc32_x86.S won't break due to a missing table. +#if defined(HAVE_USABLE_CLMUL) && ((defined(__x86_64__) && defined(__SSSE3__) \ + && defined(__SSE4_1__) && defined(__PCLMUL__)) \ + || (defined(__e2k__) && __iset__ >= 6)) +# define NO_CRC32_TABLE + +#elif defined(HAVE_ARM64_CRC32) \ + && !defined(WORDS_BIGENDIAN) \ + && defined(__ARM_FEATURE_CRC32) +# define NO_CRC32_TABLE +#endif + + +#if !defined(HAVE_ENCODERS) && defined(NO_CRC32_TABLE) +// No table needed. Use a typedef to avoid an empty translation unit. 
+typedef void lzma_crc32_dummy; + +#else // Having the declaration here silences clang -Wmissing-variable-declarations. extern const uint32_t lzma_crc32_table[8][256]; -#ifdef WORDS_BIGENDIAN -# include "crc32_table_be.h" -#else -# include "crc32_table_le.h" +# ifdef WORDS_BIGENDIAN +# include "crc32_table_be.h" +# else +# include "crc32_table_le.h" +# endif #endif diff --git a/src/liblzma/check/crc32_table_be.h b/src/liblzma/check/crc32_table_be.h index c483cb6..505c230 100644 --- a/src/liblzma/check/crc32_table_be.h +++ b/src/liblzma/check/crc32_table_be.h @@ -1,4 +1,6 @@ -/* This file has been automatically generated by crc32_tablegen.c. */ +// SPDX-License-Identifier: 0BSD + +// This file has been generated by crc32_tablegen.c. const uint32_t lzma_crc32_table[8][256] = { { diff --git a/src/liblzma/check/crc32_table_le.h b/src/liblzma/check/crc32_table_le.h index 25f4fc4..e89c21a 100644 --- a/src/liblzma/check/crc32_table_le.h +++ b/src/liblzma/check/crc32_table_le.h @@ -1,4 +1,6 @@ -/* This file has been automatically generated by crc32_tablegen.c. */ +// SPDX-License-Identifier: 0BSD + +// This file has been generated by crc32_tablegen.c. const uint32_t lzma_crc32_table[8][256] = { { diff --git a/src/liblzma/check/crc32_tablegen.c b/src/liblzma/check/crc32_tablegen.c index 31a4d27..b8cf459 100644 --- a/src/liblzma/check/crc32_tablegen.c +++ b/src/liblzma/check/crc32_tablegen.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc32_tablegen.c @@ -9,9 +11,6 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include <stdio.h> @@ -44,7 +43,7 @@ init_crc32_table(void) #ifdef WORDS_BIGENDIAN for (size_t s = 0; s < 8; ++s) for (size_t b = 0; b < 256; ++b) - crc32_table[s][b] = bswap32(crc32_table[s][b]); + crc32_table[s][b] = byteswap32(crc32_table[s][b]); #endif return; @@ -54,9 +53,11 @@ init_crc32_table(void) static void print_crc32_table(void) { - printf("/* This file has been automatically generated by " - "crc32_tablegen.c. */\n\n" - "const uint32_t lzma_crc32_table[8][256] = {\n\t{"); + // Split the SPDX string so that it won't accidentally match + // when tools search for the string. + printf("// SPDX" "-License-Identifier" ": 0BSD\n\n" + "// This file has been generated by crc32_tablegen.c.\n\n" + "const uint32_t lzma_crc32_table[8][256] = {\n\t{"); for (size_t s = 0; s < 8; ++s) { for (size_t b = 0; b < 256; ++b) { @@ -82,9 +83,11 @@ print_crc32_table(void) static void print_lz_table(void) { - printf("/* This file has been automatically generated by " - "crc32_tablegen.c. */\n\n" - "const uint32_t lzma_lz_hash_table[256] = {"); + // Split the SPDX string so that it won't accidentally match + // when tools search for the string. 
+ printf("// SPDX" "-License-Identifier" ": 0BSD\n\n" + "// This file has been generated by crc32_tablegen.c.\n\n" + "const uint32_t lzma_lz_hash_table[256] = {"); for (size_t b = 0; b < 256; ++b) { if ((b % 4) == 0) diff --git a/src/liblzma/check/crc32_x86.S b/src/liblzma/check/crc32_x86.S index 4f395df..ddc3cee 100644 --- a/src/liblzma/check/crc32_x86.S +++ b/src/liblzma/check/crc32_x86.S @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * Speed-optimized CRC32 using slicing-by-eight algorithm * @@ -11,9 +13,6 @@ * Authors: Igor Pavlov (original version) * Lasse Collin (AT&T syntax, PIC support, better portability) * - * This file has been put into the public domain. - * You can do whatever you want with this file. - * * This code needs lzma_crc32_table, which can be created using the * following C code: diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c index 0c8622a..0ce83fe 100644 --- a/src/liblzma/check/crc64_fast.c +++ b/src/liblzma/check/crc64_fast.c @@ -1,85 +1,30 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc64.c /// \brief CRC64 calculation -/// -/// There are two methods in this file. crc64_generic uses the -/// the slice-by-four algorithm. This is the same idea that is -/// used in crc32_fast.c, but for CRC64 we use only four tables -/// instead of eight to avoid increasing CPU cache usage. -/// -/// crc64_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions. -/// It was derived from -/// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -/// and the public domain code from https://github.com/rawrunprotected/crc -/// (URLs were checked on 2022-11-07). -/// -/// FIXME: Builds for 32-bit x86 use crc64_x86.S by default instead -/// of this file and thus CLMUL version isn't available on 32-bit x86 -/// unless configured with --disable-assembler. Even then the lookup table -/// isn't omitted in crc64_table.c since it doesn't know that assembly -/// code has been disabled. // // Authors: Lasse Collin // Ilya Kurdyukov // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include "check.h" +#include "crc_common.h" -#undef CRC_GENERIC -#undef CRC_CLMUL -#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS - -// If CLMUL cannot be used then only the generic slice-by-four is built. -#if !defined(HAVE_USABLE_CLMUL) -# define CRC_GENERIC 1 - -// If CLMUL is allowed unconditionally in the compiler options then the -// generic version can be omitted. Note that this doesn't work with MSVC -// as I don't know how to detect the features here. -// -// NOTE: Keep this this in sync with crc64_table.c. -#elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \ - || (defined(__e2k__) && __iset__ >= 6) -# define CRC_CLMUL 1 - -// Otherwise build both and detect at runtime which version to use. -#else -# define CRC_GENERIC 1 -# define CRC_CLMUL 1 - -/* - // The generic code is much faster with 1-8-byte inputs and has - // similar performance up to 16 bytes at least in microbenchmarks - // (it depends on input buffer alignment too). If both versions are - // built, this #define will use the generic version for inputs up to - // 16 bytes and CLMUL for bigger inputs. 
It saves a little in code - // size since the special cases for 0-16-byte inputs will be omitted - // from the CLMUL code. -# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1 -*/ - -# if defined(_MSC_VER) -# include <intrin.h> -# elif defined(HAVE_CPUID_H) -# include <cpuid.h> -# endif +#if defined(CRC_X86_CLMUL) +# define BUILDING_CRC64_CLMUL +# include "crc_x86_clmul.h" #endif +#ifdef CRC64_GENERIC + ///////////////////////////////// // Generic slice-by-four CRC64 // ///////////////////////////////// -#ifdef CRC_GENERIC - -#include "crc_macros.h" - - #ifdef WORDS_BIGENDIAN # define A1(x) ((x) >> 56) #else @@ -94,7 +39,7 @@ crc64_generic(const uint8_t *buf, size_t size, uint64_t crc) crc = ~crc; #ifdef WORDS_BIGENDIAN - crc = bswap64(crc); + crc = byteswap64(crc); #endif if (size > 4) { @@ -128,7 +73,7 @@ crc64_generic(const uint8_t *buf, size_t size, uint64_t crc) crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc); #ifdef WORDS_BIGENDIAN - crc = bswap64(crc); + crc = byteswap64(crc); #endif return ~crc; @@ -136,336 +81,40 @@ crc64_generic(const uint8_t *buf, size_t size, uint64_t crc) #endif -///////////////////// -// x86 CLMUL CRC64 // -///////////////////// - -#ifdef CRC_CLMUL - -#include <immintrin.h> - - -/* -// These functions were used to generate the constants -// at the top of crc64_clmul(). -static uint64_t -calc_lo(uint64_t poly) -{ - uint64_t a = poly; - uint64_t b = 0; - - for (unsigned i = 0; i < 64; ++i) { - b = (b >> 1) | (a << 63); - a = (a >> 1) ^ (a & 1 ? poly : 0); - } - - return b; -} - -static uint64_t -calc_hi(uint64_t poly, uint64_t a) -{ - for (unsigned i = 0; i < 64; ++i) - a = (a >> 1) ^ (a & 1 ? poly : 0); - - return a; -} -*/ - - -#define MASK_L(in, mask, r) \ - r = _mm_shuffle_epi8(in, mask) - -#define MASK_H(in, mask, r) \ - r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign)) - -#define MASK_LH(in, mask, low, high) \ - MASK_L(in, mask, low); \ - MASK_H(in, mask, high) - - -// MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC -// code when optimizations are enabled (release build). According to the bug -// report, the ebx register is corrupted and the calculated result is wrong. -// Trying to workaround the problem with "__asm mov ebx, ebx" didn't help. -// The following pragma works and performance is still good. x86-64 builds -// aren't affected by this problem. -// -// NOTE: Another pragma after the function restores the optimizations. -// If the #if condition here is updated, the other one must be updated too. -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \ - && defined(_M_IX86) -# pragma optimize("g", off) -#endif - -// EDG-based compilers (Intel's classic compiler and compiler for E2K) can -// define __GNUC__ but the attribute must not be used with them. -// The new Clang-based ICX needs the attribute. -// -// NOTE: Build systems check for this too, keep them in sync with this. -#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) -__attribute__((__target__("ssse3,sse4.1,pclmul"))) -#endif -// The intrinsics use 16-byte-aligned reads from buf, thus they may read -// up to 15 bytes before or after the buffer (depending on the alignment -// of the buf argument). The values of the extra bytes are ignored. -// This unavoidably trips -fsanitize=address so address sanitizier has -// to be disabled for this function. 
-#if lzma_has_attribute(__no_sanitize_address__) -__attribute__((__no_sanitize_address__)) -#endif -static uint64_t -crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) -{ - // The prototypes of the intrinsics use signed types while most of - // the values are treated as unsigned here. These warnings in this - // function have been checked and found to be harmless so silence them. -#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wsign-conversion" -# pragma GCC diagnostic ignored "-Wconversion" -#endif - -#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS - // The code assumes that there is at least one byte of input. - if (size == 0) - return crc; -#endif - - // const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial - const uint64_t p = 0x92d8af2baf0e1e85; // (poly << 1) | 1 - const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1 - const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1) - const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2) - const __m128i vfold0 = _mm_set_epi64x(p, mu); - const __m128i vfold1 = _mm_set_epi64x(k2, k1); - - // Create a vector with 8-bit values 0 to 15. This is used to - // construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8. - const __m128i vramp = _mm_setr_epi32( - 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c); - - // This is used to inverse the control mask of _mm_shuffle_epi8 - // so that bytes that wouldn't be picked with the original mask - // will be picked and vice versa. - const __m128i vsign = _mm_set1_epi8(0x80); - - // Memory addresses A to D and the distances between them: - // - // A B C D - // [skip_start][size][skip_end] - // [ size2 ] - // - // A and D are 16-byte aligned. B and C are 1-byte aligned. - // skip_start and skip_end are 0-15 bytes. size is at least 1 byte. - // - // A = aligned_buf will initially point to this address. - // B = The address pointed by the caller-supplied buf. - // C = buf + size == aligned_buf + size2 - // D = buf + size + skip_end == aligned_buf + size2 + skip_end - const size_t skip_start = (size_t)((uintptr_t)buf & 15); - const size_t skip_end = (size_t)((0U - (uintptr_t)(buf + size)) & 15); - const __m128i *aligned_buf = (const __m128i *)( - (uintptr_t)buf & ~(uintptr_t)15); - - // If size2 <= 16 then the whole input fits into a single 16-byte - // vector. If size2 > 16 then at least two 16-byte vectors must - // be processed. If size2 > 16 && size <= 16 then there is only - // one 16-byte vector's worth of input but it is unaligned in memory. - // - // NOTE: There is no integer overflow here if the arguments are valid. - // If this overflowed, buf + size would too. - size_t size2 = skip_start + size; - - // Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8: - // The first skip_start or skip_end bytes in the vectors will have - // the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8 - // will produce zeros for these positions. (Bitwise-xor of these - // masks with vsign will produce the opposite behavior.) - const __m128i mask_start - = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start)); - const __m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end)); - - // Get the first 1-16 bytes into data0. If loading less than 16 bytes, - // the bytes are loaded to the high bits of the vector and the least - // significant positions are filled with zeros. 
- const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf), - _mm_setzero_si128(), mask_start); - ++aligned_buf; - -#if defined(__i386__) || defined(_M_IX86) - const __m128i initial_crc = _mm_set_epi64x(0, ~crc); -#else - // GCC and Clang would produce good code with _mm_set_epi64x - // but MSVC needs _mm_cvtsi64_si128 on x86-64. - const __m128i initial_crc = _mm_cvtsi64_si128(~crc); -#endif +#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED) - __m128i v0, v1, v2, v3; - -#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS - if (size <= 16) { - // Right-shift initial_crc by 1-16 bytes based on "size" - // and store the result in v1 (high bytes) and v0 (low bytes). - // - // NOTE: The highest 8 bytes of initial_crc are zeros so - // v1 will be filled with zeros if size >= 8. The highest 8 - // bytes of v1 will always become zeros. - // - // [ v1 ][ v0 ] - // [ initial_crc ] size == 1 - // [ initial_crc ] size == 2 - // [ initial_crc ] size == 15 - // [ initial_crc ] size == 16 (all in v0) - const __m128i mask_low = _mm_add_epi8( - vramp, _mm_set1_epi8(size - 16)); - MASK_LH(initial_crc, mask_low, v0, v1); - - if (size2 <= 16) { - // There are 1-16 bytes of input and it is all - // in data0. Copy the input bytes to v3. If there - // are fewer than 16 bytes, the low bytes in v3 - // will be filled with zeros. That is, the input - // bytes are stored to the same position as - // (part of) initial_crc is in v0. - MASK_L(data0, mask_end, v3); - } else { - // There are 2-16 bytes of input but not all bytes - // are in data0. - const __m128i data1 = _mm_load_si128(aligned_buf); - - // Collect the 2-16 input bytes from data0 and data1 - // to v2 and v3, and bitwise-xor them with the - // low bits of initial_crc in v0. Note that the - // the second xor is below this else-block as it - // is shared with the other branch. - MASK_H(data0, mask_end, v2); - MASK_L(data1, mask_end, v3); - v0 = _mm_xor_si128(v0, v2); - } +////////////////////////// +// Function dispatching // +////////////////////////// - v0 = _mm_xor_si128(v0, v3); - v1 = _mm_alignr_epi8(v1, v0, 8); - } else -#endif - { - const __m128i data1 = _mm_load_si128(aligned_buf); - MASK_LH(initial_crc, mask_start, v0, v1); - v0 = _mm_xor_si128(v0, data0); - v1 = _mm_xor_si128(v1, data1); - -#define FOLD \ - v1 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x00)); \ - v0 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x11)); - - while (size2 > 32) { - ++aligned_buf; - size2 -= 16; - FOLD - v1 = _mm_load_si128(aligned_buf); - } - - if (size2 < 32) { - MASK_H(v0, mask_end, v2); - MASK_L(v0, mask_end, v0); - MASK_L(v1, mask_end, v3); - v1 = _mm_or_si128(v2, v3); - } - - FOLD - v1 = _mm_srli_si128(v0, 8); -#undef FOLD - } +// If both the generic and arch-optimized implementations are usable, then +// the function that is used is selected at runtime. See crc32_fast.c. 
- v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold1, 0x10), v1); - v0 = _mm_clmulepi64_si128(v1, vfold0, 0x00); - v2 = _mm_clmulepi64_si128(v0, vfold0, 0x10); - v0 = _mm_xor_si128(_mm_xor_si128(v2, _mm_slli_si128(v0, 8)), v1); +typedef uint64_t (*crc64_func_type)( + const uint8_t *buf, size_t size, uint64_t crc); -#if defined(__i386__) || defined(_M_IX86) - return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) | - (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2)); -#else - return ~(uint64_t)_mm_extract_epi64(v0, 1); -#endif - -#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__) -# pragma GCC diagnostic pop -#endif -} -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \ - && defined(_M_IX86) -# pragma optimize("", on) -#endif -#endif - - -//////////////////////// -// Detect CPU support // -//////////////////////// - -#if defined(CRC_GENERIC) && defined(CRC_CLMUL) -static inline bool -is_clmul_supported(void) +static crc64_func_type +crc64_resolve(void) { - int success = 1; - uint32_t r[4]; // eax, ebx, ecx, edx - -#if defined(_MSC_VER) - // This needs <intrin.h> with MSVC. ICC has it as a built-in - // on all platforms. - __cpuid(r, 1); -#elif defined(HAVE_CPUID_H) - // Compared to just using __asm__ to run CPUID, this also checks - // that CPUID is supported and saves and restores ebx as that is - // needed with GCC < 5 with position-independent code (PIC). - success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); -#else - // Just a fallback that shouldn't be needed. - __asm__("cpuid\n\t" - : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) - : "a"(1), "c"(0)); -#endif - - // Returns true if these are supported: - // CLMUL (bit 1 in ecx) - // SSSE3 (bit 9 in ecx) - // SSE4.1 (bit 19 in ecx) - const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); - return success && (r[2] & ecx_mask) == ecx_mask; - - // Alternative methods that weren't used: - // - ICC's _may_i_use_cpu_feature: the other methods should work too. - // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") - // - // CPUID decding is needed with MSVC anyway and older GCC. This keeps - // the feature checks in the build system simpler too. The nice thing - // about __builtin_cpu_supports would be that it generates very short - // code as is it only reads a variable set at startup but a few bytes - // doesn't matter here. + return is_arch_extension_supported() + ? &crc64_arch_optimized : &crc64_generic; } - #ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR -# define CRC64_FUNC_INIT # define CRC64_SET_FUNC_ATTR __attribute__((__constructor__)) +static crc64_func_type crc64_func; #else -# define CRC64_FUNC_INIT = &crc64_dispatch # define CRC64_SET_FUNC_ATTR static uint64_t crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc); +static crc64_func_type crc64_func = &crc64_dispatch; #endif -// Pointer to the the selected CRC64 method. -static uint64_t (*crc64_func)(const uint8_t *buf, size_t size, uint64_t crc) - CRC64_FUNC_INIT; - - CRC64_SET_FUNC_ATTR static void crc64_set_func(void) { - crc64_func = is_clmul_supported() ? &crc64_clmul : &crc64_generic; + crc64_func = crc64_resolve(); return; } @@ -474,12 +123,6 @@ crc64_set_func(void) static uint64_t crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc) { - // When __attribute__((__constructor__)) isn't supported, set the - // function pointer without any locking. If multiple threads run - // the detection code in parallel, they will all end up setting - // the pointer to the same value. 
This avoids the use of - // mythread_once() on every call to lzma_crc64() but this likely - // isn't strictly standards compliant. Let's change it if it breaks. crc64_set_func(); return crc64_func(buf, size, crc); } @@ -490,47 +133,22 @@ crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc) extern LZMA_API(uint64_t) lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) { -#if defined(CRC_GENERIC) && defined(CRC_CLMUL) - // If CLMUL is available, it is the best for non-tiny inputs, - // being over twice as fast as the generic slice-by-four version. - // However, for size <= 16 it's different. In the extreme case - // of size == 1 the generic version can be five times faster. - // At size >= 8 the CLMUL starts to become reasonable. It - // varies depending on the alignment of buf too. - // - // The above doesn't include the overhead of mythread_once(). - // At least on x86-64 GNU/Linux, pthread_once() is very fast but - // it still makes lzma_crc64(buf, 1, crc) 50-100 % slower. When - // size reaches 12-16 bytes the overhead becomes negligible. - // - // So using the generic version for size <= 16 may give better - // performance with tiny inputs but if such inputs happen rarely - // it's not so obvious because then the lookup table of the - // generic version may not be in the processor cache. +#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED) + #ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS if (size <= 16) return crc64_generic(buf, size, crc); #endif - -/* -#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR - // See crc64_dispatch(). This would be the alternative which uses - // locking and doesn't use crc64_dispatch(). Note that on Windows - // this method needs Vista threads. - mythread_once(crc64_set_func); -#endif -*/ - return crc64_func(buf, size, crc); -#elif defined(CRC_CLMUL) - // If CLMUL is used unconditionally without runtime CPU detection - // then omitting the generic version and its 8 KiB lookup table - // makes the library smaller. +#elif defined(CRC64_ARCH_OPTIMIZED) + // If arch-optimized version is used unconditionally without runtime + // CPU detection then omitting the generic version and its 8 KiB + // lookup table makes the library smaller. // // FIXME: Lookup table isn't currently omitted on 32-bit x86, // see crc64_table.c. - return crc64_clmul(buf, size, crc); + return crc64_arch_optimized(buf, size, crc); #else return crc64_generic(buf, size, crc); diff --git a/src/liblzma/check/crc64_small.c b/src/liblzma/check/crc64_small.c index 420f7cf..ee4ea26 100644 --- a/src/liblzma/check/crc64_small.c +++ b/src/liblzma/check/crc64_small.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc64_small.c @@ -5,9 +7,6 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include "check.h" diff --git a/src/liblzma/check/crc64_table.c b/src/liblzma/check/crc64_table.c index 688e527..78e4275 100644 --- a/src/liblzma/check/crc64_table.c +++ b/src/liblzma/check/crc64_table.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc64_table.c @@ -5,19 +7,21 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. 
-// /////////////////////////////////////////////////////////////////////////////// #include "common.h" -// FIXME: Compared to crc64_fast.c this has to check for __x86_64__ too +// FIXME: Compared to crc_common.h this has to check for __x86_64__ too // so that in 32-bit builds crc64_x86.S won't break due to a missing table. -#if (defined(__x86_64__) && defined(__SSSE3__) \ +#if defined(HAVE_USABLE_CLMUL) && ((defined(__x86_64__) && defined(__SSSE3__) \ && defined(__SSE4_1__) && defined(__PCLMUL__)) \ - || (defined(__e2k__) && __iset__ >= 6) + || (defined(__e2k__) && __iset__ >= 6)) +# define NO_CRC64_TABLE +#endif + + +#ifdef NO_CRC64_TABLE // No table needed. Use a typedef to avoid an empty translation unit. typedef void lzma_crc64_dummy; diff --git a/src/liblzma/check/crc64_table_be.h b/src/liblzma/check/crc64_table_be.h index ea074f3..db76cc7 100644 --- a/src/liblzma/check/crc64_table_be.h +++ b/src/liblzma/check/crc64_table_be.h @@ -1,4 +1,6 @@ -/* This file has been automatically generated by crc64_tablegen.c. */ +// SPDX-License-Identifier: 0BSD + +// This file has been generated by crc64_tablegen.c. const uint64_t lzma_crc64_table[4][256] = { { diff --git a/src/liblzma/check/crc64_table_le.h b/src/liblzma/check/crc64_table_le.h index 1196b31..e40a8c8 100644 --- a/src/liblzma/check/crc64_table_le.h +++ b/src/liblzma/check/crc64_table_le.h @@ -1,4 +1,6 @@ -/* This file has been automatically generated by crc64_tablegen.c. */ +// SPDX-License-Identifier: 0BSD + +// This file has been generated by crc64_tablegen.c. const uint64_t lzma_crc64_table[4][256] = { { diff --git a/src/liblzma/check/crc64_tablegen.c b/src/liblzma/check/crc64_tablegen.c index fddaa7e..2035127 100644 --- a/src/liblzma/check/crc64_tablegen.c +++ b/src/liblzma/check/crc64_tablegen.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file crc64_tablegen.c @@ -8,9 +10,6 @@ // // Author: Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include <stdio.h> @@ -43,7 +42,7 @@ init_crc64_table(void) #ifdef WORDS_BIGENDIAN for (size_t s = 0; s < 4; ++s) for (size_t b = 0; b < 256; ++b) - crc64_table[s][b] = bswap64(crc64_table[s][b]); + crc64_table[s][b] = byteswap64(crc64_table[s][b]); #endif return; @@ -53,9 +52,11 @@ init_crc64_table(void) static void print_crc64_table(void) { - printf("/* This file has been automatically generated by " - "crc64_tablegen.c. */\n\n" - "const uint64_t lzma_crc64_table[4][256] = {\n\t{"); + // Split the SPDX string so that it won't accidentally match + // when tools search for the string. + printf("// SPDX" "-License-Identifier" ": 0BSD\n\n" + "// This file has been generated by crc64_tablegen.c.\n\n" + "const uint64_t lzma_crc64_table[4][256] = {\n\t{"); for (size_t s = 0; s < 4; ++s) { for (size_t b = 0; b < 256; ++b) { diff --git a/src/liblzma/check/crc64_x86.S b/src/liblzma/check/crc64_x86.S index 9aecf58..47f6081 100644 --- a/src/liblzma/check/crc64_x86.S +++ b/src/liblzma/check/crc64_x86.S @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * Speed-optimized CRC64 using slicing-by-four algorithm * @@ -7,9 +9,6 @@ * Authors: Igor Pavlov (original CRC32 assembly code) * Lasse Collin (CRC64 adaptation of the modified CRC32 code) * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- * * This code needs lzma_crc64_table, which can be created using the * following C code: diff --git a/src/liblzma/check/crc_common.h b/src/liblzma/check/crc_common.h new file mode 100644 index 0000000..63a7b5c --- /dev/null +++ b/src/liblzma/check/crc_common.h @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file crc_common.h +/// \brief Some functions and macros for CRC32 and CRC64 +// +// Authors: Lasse Collin +// Ilya Kurdyukov +// Hans Jansen +// Jia Tan +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef LZMA_CRC_COMMON_H +#define LZMA_CRC_COMMON_H + +#include "common.h" + + +#ifdef WORDS_BIGENDIAN +# define A(x) ((x) >> 24) +# define B(x) (((x) >> 16) & 0xFF) +# define C(x) (((x) >> 8) & 0xFF) +# define D(x) ((x) & 0xFF) + +# define S8(x) ((x) << 8) +# define S32(x) ((x) << 32) + +#else +# define A(x) ((x) & 0xFF) +# define B(x) (((x) >> 8) & 0xFF) +# define C(x) (((x) >> 16) & 0xFF) +# define D(x) ((x) >> 24) + +# define S8(x) ((x) >> 8) +# define S32(x) ((x) >> 32) +#endif + + +// CRC CLMUL code needs this because accessing input buffers that aren't +// aligned to the vector size will inherently trip the address sanitizer. +#if lzma_has_attribute(__no_sanitize_address__) +# define crc_attr_no_sanitize_address \ + __attribute__((__no_sanitize_address__)) +#else +# define crc_attr_no_sanitize_address +#endif + +// Keep this in sync with changes to crc32_arm64.h +#if defined(_WIN32) || defined(HAVE_GETAUXVAL) \ + || defined(HAVE_ELF_AUX_INFO) \ + || (defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)) +# define ARM64_RUNTIME_DETECTION 1 +#endif + + +#undef CRC32_GENERIC +#undef CRC64_GENERIC + +#undef CRC32_ARCH_OPTIMIZED +#undef CRC64_ARCH_OPTIMIZED + +// The x86 CLMUL is used for both CRC32 and CRC64. +#undef CRC_X86_CLMUL + +#undef CRC32_ARM64 +#undef CRC64_ARM64_CLMUL + +#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS + +// ARM64 CRC32 instruction is only useful for CRC32. Currently, only +// little endian is supported since we were unable to test on a big +// endian machine. +// +// NOTE: Keep this and the next check in sync with the macro +// NO_CRC32_TABLE in crc32_table.c +#if defined(HAVE_ARM64_CRC32) && !defined(WORDS_BIGENDIAN) +// Allow ARM64 CRC32 instruction without a runtime check if +// __ARM_FEATURE_CRC32 is defined. GCC and Clang only define this if the +// proper compiler options are used. +# if defined(__ARM_FEATURE_CRC32) +# define CRC32_ARCH_OPTIMIZED 1 +# define CRC32_ARM64 1 +# elif defined(ARM64_RUNTIME_DETECTION) +# define CRC32_ARCH_OPTIMIZED 1 +# define CRC32_ARM64 1 +# define CRC32_GENERIC 1 +# endif +#endif + +#if defined(HAVE_USABLE_CLMUL) +// If CLMUL is allowed unconditionally in the compiler options then the +// generic version can be omitted. Note that this doesn't work with MSVC +// as I don't know how to detect the features here. +// +// NOTE: Keep this in sync with the NO_CRC32_TABLE macro in crc32_table.c +// and NO_CRC64_TABLE in crc64_table.c. 
+# if (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \ + || (defined(__e2k__) && __iset__ >= 6) +# define CRC32_ARCH_OPTIMIZED 1 +# define CRC64_ARCH_OPTIMIZED 1 +# define CRC_X86_CLMUL 1 +# else +# define CRC32_GENERIC 1 +# define CRC64_GENERIC 1 +# define CRC32_ARCH_OPTIMIZED 1 +# define CRC64_ARCH_OPTIMIZED 1 +# define CRC_X86_CLMUL 1 + +/* + // The generic code is much faster with 1-8-byte inputs and + // has similar performance up to 16 bytes at least in + // microbenchmarks (it depends on input buffer alignment + // too). If both versions are built, this #define will use + // the generic version for inputs up to 16 bytes and CLMUL + // for bigger inputs. It saves a little in code size since + // the special cases for 0-16-byte inputs will be omitted + // from the CLMUL code. +# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1 +*/ +# endif +#endif + +// For CRC32 use the generic slice-by-eight implementation if no optimized +// version is available. +#if !defined(CRC32_ARCH_OPTIMIZED) && !defined(CRC32_GENERIC) +# define CRC32_GENERIC 1 +#endif + +// For CRC64 use the generic slice-by-four implementation if no optimized +// version is available. +#if !defined(CRC64_ARCH_OPTIMIZED) && !defined(CRC64_GENERIC) +# define CRC64_GENERIC 1 +#endif + +#endif diff --git a/src/liblzma/check/crc_macros.h b/src/liblzma/check/crc_macros.h deleted file mode 100644 index a7c21b7..0000000 --- a/src/liblzma/check/crc_macros.h +++ /dev/null @@ -1,30 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// -/// \file crc_macros.h -/// \brief Some endian-dependent macros for CRC32 and CRC64 -// -// Author: Lasse Collin -// -// This file has been put into the public domain. -// You can do whatever you want with this file. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifdef WORDS_BIGENDIAN -# define A(x) ((x) >> 24) -# define B(x) (((x) >> 16) & 0xFF) -# define C(x) (((x) >> 8) & 0xFF) -# define D(x) ((x) & 0xFF) - -# define S8(x) ((x) << 8) -# define S32(x) ((x) << 32) - -#else -# define A(x) ((x) & 0xFF) -# define B(x) (((x) >> 8) & 0xFF) -# define C(x) (((x) >> 16) & 0xFF) -# define D(x) ((x) >> 24) - -# define S8(x) ((x) >> 8) -# define S32(x) ((x) >> 32) -#endif diff --git a/src/liblzma/check/crc_x86_clmul.h b/src/liblzma/check/crc_x86_clmul.h new file mode 100644 index 0000000..f1254ec --- /dev/null +++ b/src/liblzma/check/crc_x86_clmul.h @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: 0BSD + +/////////////////////////////////////////////////////////////////////////////// +// +/// \file crc_x86_clmul.h +/// \brief CRC32 and CRC64 implementations using CLMUL instructions. +/// +/// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and +/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too. +/// +/// They were derived from +/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation +/// and the public domain code from https://github.com/rawrunprotected/crc +/// (URLs were checked on 2023-10-14). +/// +/// While this file has both CRC32 and CRC64 implementations, only one +/// should be built at a time to ensure that crc_simd_body() is inlined +/// even with compilers with which lzma_always_inline expands to plain inline. +/// The version to build is selected by defining BUILDING_CRC32_CLMUL or +/// BUILDING_CRC64_CLMUL before including this file. 
+/// +/// FIXME: Builds for 32-bit x86 use the assembly .S files by default +/// unless configured with --disable-assembler. Even then the lookup table +/// isn't omitted in crc64_table.c since it doesn't know that assembly +/// code has been disabled. +// +// Authors: Ilya Kurdyukov +// Hans Jansen +// Lasse Collin +// Jia Tan +// +/////////////////////////////////////////////////////////////////////////////// + +// This file must not be included more than once. +#ifdef LZMA_CRC_X86_CLMUL_H +# error crc_x86_clmul.h was included twice. +#endif +#define LZMA_CRC_X86_CLMUL_H + +#include <immintrin.h> + +#if defined(_MSC_VER) +# include <intrin.h> +#elif defined(HAVE_CPUID_H) +# include <cpuid.h> +#endif + + +// EDG-based compilers (Intel's classic compiler and compiler for E2K) can +// define __GNUC__ but the attribute must not be used with them. +// The new Clang-based ICX needs the attribute. +// +// NOTE: Build systems check for this too, keep them in sync with this. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) +# define crc_attr_target \ + __attribute__((__target__("ssse3,sse4.1,pclmul"))) +#else +# define crc_attr_target +#endif + + +#define MASK_L(in, mask, r) r = _mm_shuffle_epi8(in, mask) + +#define MASK_H(in, mask, r) \ + r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign)) + +#define MASK_LH(in, mask, low, high) \ + MASK_L(in, mask, low); \ + MASK_H(in, mask, high) + + +crc_attr_target +crc_attr_no_sanitize_address +static lzma_always_inline void +crc_simd_body(const uint8_t *buf, const size_t size, __m128i *v0, __m128i *v1, + const __m128i vfold16, const __m128i initial_crc) +{ + // Create a vector with 8-bit values 0 to 15. This is used to + // construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8. + const __m128i vramp = _mm_setr_epi32( + 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c); + + // This is used to inverse the control mask of _mm_shuffle_epi8 + // so that bytes that wouldn't be picked with the original mask + // will be picked and vice versa. + const __m128i vsign = _mm_set1_epi8(-0x80); + + // Memory addresses A to D and the distances between them: + // + // A B C D + // [skip_start][size][skip_end] + // [ size2 ] + // + // A and D are 16-byte aligned. B and C are 1-byte aligned. + // skip_start and skip_end are 0-15 bytes. size is at least 1 byte. + // + // A = aligned_buf will initially point to this address. + // B = The address pointed by the caller-supplied buf. + // C = buf + size == aligned_buf + size2 + // D = buf + size + skip_end == aligned_buf + size2 + skip_end + const size_t skip_start = (size_t)((uintptr_t)buf & 15); + const size_t skip_end = (size_t)((0U - (uintptr_t)(buf + size)) & 15); + const __m128i *aligned_buf = (const __m128i *)( + (uintptr_t)buf & ~(uintptr_t)15); + + // If size2 <= 16 then the whole input fits into a single 16-byte + // vector. If size2 > 16 then at least two 16-byte vectors must + // be processed. If size2 > 16 && size <= 16 then there is only + // one 16-byte vector's worth of input but it is unaligned in memory. + // + // NOTE: There is no integer overflow here if the arguments + // are valid. If this overflowed, buf + size would too. + const size_t size2 = skip_start + size; + + // Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8: + // The first skip_start or skip_end bytes in the vectors will have + // the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8 + // will produce zeros for these positions. 
(Bitwise-xor of these + // masks with vsign will produce the opposite behavior.) + const __m128i mask_start + = _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_start)); + const __m128i mask_end + = _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_end)); + + // Get the first 1-16 bytes into data0. If loading less than 16 + // bytes, the bytes are loaded to the high bits of the vector and + // the least significant positions are filled with zeros. + const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf), + _mm_setzero_si128(), mask_start); + aligned_buf++; + + __m128i v2, v3; + +#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS + if (size <= 16) { + // Right-shift initial_crc by 1-16 bytes based on "size" + // and store the result in v1 (high bytes) and v0 (low bytes). + // + // NOTE: The highest 8 bytes of initial_crc are zeros so + // v1 will be filled with zeros if size >= 8. The highest + // 8 bytes of v1 will always become zeros. + // + // [ v1 ][ v0 ] + // [ initial_crc ] size == 1 + // [ initial_crc ] size == 2 + // [ initial_crc ] size == 15 + // [ initial_crc ] size == 16 (all in v0) + const __m128i mask_low = _mm_add_epi8( + vramp, _mm_set1_epi8((char)(size - 16))); + MASK_LH(initial_crc, mask_low, *v0, *v1); + + if (size2 <= 16) { + // There are 1-16 bytes of input and it is all + // in data0. Copy the input bytes to v3. If there + // are fewer than 16 bytes, the low bytes in v3 + // will be filled with zeros. That is, the input + // bytes are stored to the same position as + // (part of) initial_crc is in v0. + MASK_L(data0, mask_end, v3); + } else { + // There are 2-16 bytes of input but not all bytes + // are in data0. + const __m128i data1 = _mm_load_si128(aligned_buf); + + // Collect the 2-16 input bytes from data0 and data1 + // to v2 and v3, and bitwise-xor them with the + // low bits of initial_crc in v0. Note that the + // the second xor is below this else-block as it + // is shared with the other branch. + MASK_H(data0, mask_end, v2); + MASK_L(data1, mask_end, v3); + *v0 = _mm_xor_si128(*v0, v2); + } + + *v0 = _mm_xor_si128(*v0, v3); + *v1 = _mm_alignr_epi8(*v1, *v0, 8); + } else +#endif + { + // There is more than 16 bytes of input. + const __m128i data1 = _mm_load_si128(aligned_buf); + const __m128i *end = (const __m128i*)( + (const char *)aligned_buf - 16 + size2); + aligned_buf++; + + MASK_LH(initial_crc, mask_start, *v0, *v1); + *v0 = _mm_xor_si128(*v0, data0); + *v1 = _mm_xor_si128(*v1, data1); + + while (aligned_buf < end) { + *v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( + *v0, vfold16, 0x00)); + *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( + *v0, vfold16, 0x11)); + *v1 = _mm_load_si128(aligned_buf++); + } + + if (aligned_buf != end) { + MASK_H(*v0, mask_end, v2); + MASK_L(*v0, mask_end, *v0); + MASK_L(*v1, mask_end, v3); + *v1 = _mm_or_si128(v2, v3); + } + + *v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( + *v0, vfold16, 0x00)); + *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128( + *v0, vfold16, 0x11)); + *v1 = _mm_srli_si128(*v0, 8); + } +} + + +///////////////////// +// x86 CLMUL CRC32 // +///////////////////// + +/* +// These functions were used to generate the constants +// at the top of crc32_arch_optimized(). 
+static uint64_t +calc_lo(uint64_t p, uint64_t a, int n) +{ + uint64_t b = 0; int i; + for (i = 0; i < n; i++) { + b = b >> 1 | (a & 1) << (n - 1); + a = (a >> 1) ^ ((0 - (a & 1)) & p); + } + return b; +} + +// same as ~crc(&a, sizeof(a), ~0) +static uint64_t +calc_hi(uint64_t p, uint64_t a, int n) +{ + int i; + for (i = 0; i < n; i++) + a = (a >> 1) ^ ((0 - (a & 1)) & p); + return a; +} +*/ + +#ifdef BUILDING_CRC32_CLMUL + +crc_attr_target +crc_attr_no_sanitize_address +static uint32_t +crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) +{ +#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS + // The code assumes that there is at least one byte of input. + if (size == 0) + return crc; +#endif + + // uint32_t poly = 0xedb88320; + const int64_t p = 0x1db710640; // p << 1 + const int64_t mu = 0x1f7011641; // calc_lo(p, p, 32) << 1 | 1 + const int64_t k5 = 0x163cd6124; // calc_hi(p, p, 32) << 1 + const int64_t k4 = 0x0ccaa009e; // calc_hi(p, p, 64) << 1 + const int64_t k3 = 0x1751997d0; // calc_hi(p, p, 128) << 1 + + const __m128i vfold4 = _mm_set_epi64x(mu, p); + const __m128i vfold8 = _mm_set_epi64x(0, k5); + const __m128i vfold16 = _mm_set_epi64x(k4, k3); + + __m128i v0, v1, v2; + + crc_simd_body(buf, size, &v0, &v1, vfold16, + _mm_cvtsi32_si128((int32_t)~crc)); + + v1 = _mm_xor_si128( + _mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0 + v2 = _mm_shuffle_epi32(v1, 0xe7); // 0xx0 + v0 = _mm_slli_epi64(v1, 32); // [0] + v0 = _mm_clmulepi64_si128(v0, vfold8, 0x00); + v0 = _mm_xor_si128(v0, v2); // [1] [2] + v2 = _mm_clmulepi64_si128(v0, vfold4, 0x10); + v2 = _mm_clmulepi64_si128(v2, vfold4, 0x00); + v0 = _mm_xor_si128(v0, v2); // [2] + return ~(uint32_t)_mm_extract_epi32(v0, 2); +} +#endif // BUILDING_CRC32_CLMUL + + +///////////////////// +// x86 CLMUL CRC64 // +///////////////////// + +/* +// These functions were used to generate the constants +// at the top of crc64_arch_optimized(). +static uint64_t +calc_lo(uint64_t poly) +{ + uint64_t a = poly; + uint64_t b = 0; + + for (unsigned i = 0; i < 64; ++i) { + b = (b >> 1) | (a << 63); + a = (a >> 1) ^ (a & 1 ? poly : 0); + } + + return b; +} + +static uint64_t +calc_hi(uint64_t poly, uint64_t a) +{ + for (unsigned i = 0; i < 64; ++i) + a = (a >> 1) ^ (a & 1 ? poly : 0); + + return a; +} +*/ + +#ifdef BUILDING_CRC64_CLMUL + +// MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC +// code when optimizations are enabled (release build). According to the bug +// report, the ebx register is corrupted and the calculated result is wrong. +// Trying to workaround the problem with "__asm mov ebx, ebx" didn't help. +// The following pragma works and performance is still good. x86-64 builds +// and CRC32 CLMUL aren't affected by this problem. The problem does not +// happen in crc_simd_body() either (which is shared with CRC32 CLMUL anyway). +// +// NOTE: Another pragma after crc64_arch_optimized() restores +// the optimizations. If the #if condition here is updated, +// the other one must be updated too. +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \ + && defined(_M_IX86) +# pragma optimize("g", off) +#endif + +crc_attr_target +crc_attr_no_sanitize_address +static uint64_t +crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc) +{ +#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS + // The code assumes that there is at least one byte of input. 
+ if (size == 0) + return crc; +#endif + + // const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial + const uint64_t p = 0x92d8af2baf0e1e85; // (poly << 1) | 1 + const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1 + const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1) + const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2) + + const __m128i vfold8 = _mm_set_epi64x((int64_t)p, (int64_t)mu); + const __m128i vfold16 = _mm_set_epi64x((int64_t)k2, (int64_t)k1); + + __m128i v0, v1, v2; + +#if defined(__i386__) || defined(_M_IX86) + crc_simd_body(buf, size, &v0, &v1, vfold16, + _mm_set_epi64x(0, (int64_t)~crc)); +#else + // GCC and Clang would produce good code with _mm_set_epi64x + // but MSVC needs _mm_cvtsi64_si128 on x86-64. + crc_simd_body(buf, size, &v0, &v1, vfold16, + _mm_cvtsi64_si128((int64_t)~crc)); +#endif + + v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold16, 0x10), v1); + v0 = _mm_clmulepi64_si128(v1, vfold8, 0x00); + v2 = _mm_clmulepi64_si128(v0, vfold8, 0x10); + v0 = _mm_xor_si128(_mm_xor_si128(v1, _mm_slli_si128(v0, 8)), v2); + +#if defined(__i386__) || defined(_M_IX86) + return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) | + (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2)); +#else + return ~(uint64_t)_mm_extract_epi64(v0, 1); +#endif +} + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \ + && defined(_M_IX86) +# pragma optimize("", on) +#endif + +#endif // BUILDING_CRC64_CLMUL + + +// Inlining this function duplicates the function body in crc32_resolve() and +// crc64_resolve(), but this is acceptable because this is a tiny function. +static inline bool +is_arch_extension_supported(void) +{ + int success = 1; + uint32_t r[4]; // eax, ebx, ecx, edx + +#if defined(_MSC_VER) + // This needs <intrin.h> with MSVC. ICC has it as a built-in + // on all platforms. + __cpuid(r, 1); +#elif defined(HAVE_CPUID_H) + // Compared to just using __asm__ to run CPUID, this also checks + // that CPUID is supported and saves and restores ebx as that is + // needed with GCC < 5 with position-independent code (PIC). + success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); +#else + // Just a fallback that shouldn't be needed. + __asm__("cpuid\n\t" + : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) + : "a"(1), "c"(0)); +#endif + + // Returns true if these are supported: + // CLMUL (bit 1 in ecx) + // SSSE3 (bit 9 in ecx) + // SSE4.1 (bit 19 in ecx) + const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); + return success && (r[2] & ecx_mask) == ecx_mask; + + // Alternative methods that weren't used: + // - ICC's _may_i_use_cpu_feature: the other methods should work too. + // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") + // + // CPUID decding is needed with MSVC anyway and older GCC. This keeps + // the feature checks in the build system simpler too. The nice thing + // about __builtin_cpu_supports would be that it generates very short + // code as is it only reads a variable set at startup but a few bytes + // doesn't matter here. +} diff --git a/src/liblzma/check/sha256.c b/src/liblzma/check/sha256.c index 6feb342..bd0d280 100644 --- a/src/liblzma/check/sha256.c +++ b/src/liblzma/check/sha256.c @@ -1,24 +1,17 @@ +// SPDX-License-Identifier: 0BSD + /////////////////////////////////////////////////////////////////////////////// // /// \file sha256.c /// \brief SHA-256 -/// -/// \todo Crypto++ has x86 ASM optimizations. 
They use SSE so if they -/// are imported to liblzma, SSE instructions need to be used -/// conditionally to keep the code working on older boxes. // -// This code is based on the code found from 7-Zip, which has a modified -// version of the SHA-256 found from Crypto++ <https://www.cryptopp.com/>. -// The code was modified a little to fit into liblzma. +// The C code is based on the public domain SHA-256 code found from +// Crypto++ Library 5.5.1 released in 2007: https://www.cryptopp.com/ +// A few minor tweaks have been made in liblzma. // -// Authors: Kevin Springle -// Wei Dai -// Igor Pavlov +// Authors: Wei Dai // Lasse Collin // -// This file has been put into the public domain. -// You can do whatever you want with this file. -// /////////////////////////////////////////////////////////////////////////////// #include "check.h"
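
For reference, the runtime dispatching that this patch introduces in crc32_fast.c and crc64_fast.c can be reproduced as a small standalone program. The sketch below is only an illustration of the same two-level scheme (constructor attribute when available, first-call resolution otherwise), not code from the patch: all demo_* names are invented for the example, a plain compiler check stands in for the configure-detected HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR macro, and the "arch-optimized" function is a stub because the feature probe here always reports false.

// Minimal sketch of the constructor / first-call-resolution dispatch
// scheme used in crc32_fast.c and crc64_fast.c above. All demo_* names
// are hypothetical; the real code resolves between crc32_generic() and
// crc32_arch_optimized().
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t (*demo_crc32_func)(
		const uint8_t *buf, size_t size, uint32_t crc);

// Portable bitwise CRC32 (reflected IEEE polynomial). This stands in for
// crc32_generic(); the real one is table-driven slice-by-eight.
static uint32_t
demo_generic(const uint8_t *buf, size_t size, uint32_t crc)
{
        crc = ~crc;
        for (size_t i = 0; i < size; ++i) {
                crc ^= buf[i];
                for (int b = 0; b < 8; ++b)
                        crc = (crc >> 1) ^ (0xEDB88320U & (0U - (crc & 1U)));
        }
        return ~crc;
}

// Stand-in for crc32_arch_optimized(); it just forwards in this demo.
static uint32_t
demo_arch(const uint8_t *buf, size_t size, uint32_t crc)
{
        return demo_generic(buf, size, crc);
}

// Stand-in for is_arch_extension_supported(). A real build would query
// CPUID, getauxval(AT_HWCAP), elf_aux_info(), IsProcessorFeaturePresent(),
// or sysctlbyname() as in the patch above.
static bool
demo_extension_supported(void)
{
        return false;
}

// Shared resolver, mirroring crc32_resolve().
static demo_crc32_func
demo_resolve(void)
{
        return demo_extension_supported() ? &demo_arch : &demo_generic;
}

#if defined(__GNUC__) || defined(__clang__)
// Method 1: a constructor sets the pointer at load time, so no check is
// needed on any call. liblzma gates this on the configure-detected macro
// HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR rather than this compiler check.
static demo_crc32_func demo_func;

__attribute__((__constructor__))
static void
demo_set_func(void)
{
        demo_func = demo_resolve();
}
#else
// Method 2: the first call lands in the dispatcher, which overwrites the
// pointer. Racing first calls all store the same value, so the missing
// lock is benign in practice (see the comment in crc32_dispatch() above).
static uint32_t demo_dispatch(const uint8_t *buf, size_t size, uint32_t crc);
static demo_crc32_func demo_func = &demo_dispatch;

static uint32_t
demo_dispatch(const uint8_t *buf, size_t size, uint32_t crc)
{
        demo_func = demo_resolve();
        return demo_func(buf, size, crc);
}
#endif

int
main(void)
{
        static const uint8_t data[] = "123456789";
        // Prints CBF43926, the well-known CRC32 check value of "123456789".
        printf("%08" PRIX32 "\n", demo_func(data, 9, 0));
        return 0;
}

Built with GCC or Clang, the constructor path is taken and the pointer is already resolved before main() runs; with a compiler lacking the attribute, the first call is routed through demo_dispatch(), which installs the resolved pointer for all later calls.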