summaryrefslogtreecommitdiffstats
path: root/src/liblzma/check/crc32_arm64.h
blob: 39c1c63ec0eced25ad5042e1611ecd8064831d8d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       crc32_arm64.h
/// \brief      CRC32 calculation with ARM64 optimization
//
//  Authors:    Chenxi Mao
//              Jia Tan
//              Hans Jansen
//
///////////////////////////////////////////////////////////////////////////////

#ifndef LZMA_CRC32_ARM64_H
#define LZMA_CRC32_ARM64_H

// MSVC always has the CRC intrinsics available when building for ARM64,
// so there is no need to include any header files.
#ifndef _MSC_VER
#	include <arm_acle.h>
#endif

// If both versions are going to be built, we need runtime detection
// to check if the instructions are supported.
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
#	if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
#		include <sys/auxv.h>
#	elif defined(_WIN32)
#		include <processthreadsapi.h>
#	elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
#		include <sys/sysctl.h>
#	endif
#endif

// crc_attr_target is placed in front of the definition of a function that
// uses the CRC intrinsics.
//
// With GCC or Clang (and not an EDG front end) it expands to a function
// attribute requesting the "+crc" target for that function. Otherwise it
// expands to nothing.
//
// Some EDG-based compilers support ARM64 and define __GNUC__
// (such as Nvidia's nvcc), but do not support function attributes.
//
// NOTE: Build systems check for this too, keep them in sync with this.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
#	define crc_attr_target __attribute__((__target__("+crc")))
#else
#	define crc_attr_target
#endif


// Compute CRC32 over buf[0..size) using the ARM64 CRC32 intrinsics.
//
// crc is the starting CRC value. It is bit-inverted on entry and the
// final state is bit-inverted again before being returned.
crc_attr_target
static uint32_t
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
{
	uint32_t state = ~crc;

	// Head: feed single bytes until buf reaches an 8-byte boundary or
	// the input runs out. Aligning the buffer first was shown to be
	// significantly faster than unaligned accesses.
	const size_t head_len = my_min(size, (0U - (uintptr_t)buf) & 7);
	const uint8_t *const head_end = buf + head_len;

	while (buf < head_end) {
		state = __crc32b(state, *buf);
		++buf;
	}

	const size_t rest = size - head_len;

	// Body: 8 bytes per step. Clearing the three least significant
	// bits of the remaining length gives a byte count that is a
	// multiple of 8 and never larger than the remaining length, so
	// the loop cannot read past the end of the buffer.
	const uint8_t *const body_end = buf + (rest & ~(size_t)7);

	while (buf < body_end) {
		state = __crc32d(state, aligned_read64le(buf));
		buf += 8;
	}

	// Tail: the last 0-7 bytes that do not fill a whole 8-byte group.
	const uint8_t *const tail_end = buf + (rest & 7);

	while (buf < tail_end) {
		state = __crc32b(state, *buf);
		++buf;
	}

	return ~state;
}


#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
// Runtime check for the ARM64 CRC32 instructions.
//
// Returns true if the platform-specific query reports that the CRC32
// feature is present, and false if it reports the feature as absent or
// if the query itself fails. Exactly one query method is selected at
// compile time; having none available is a build error.
static inline bool
is_arch_extension_supported(void)
{
#if defined(HAVE_GETAUXVAL)
	// Test the CRC32 bit in the AT_HWCAP auxiliary vector entry.
	const unsigned long hwcap = getauxval(AT_HWCAP);

	return (hwcap & HWCAP_CRC32) != 0;

#elif defined(HAVE_ELF_AUX_INFO)
	// Same AT_HWCAP test, but elf_aux_info() reports errors through
	// its return value. A failed query counts as "not supported".
	unsigned long hwcap = 0;
	const int err = elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));

	return err == 0 && (hwcap & HWCAP_CRC32) != 0;

#elif defined(_WIN32)
	return IsProcessorFeaturePresent(
			PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;

#elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
	// sysctlbyname() identifies the CPU feature by a string name.
	// Apple documents "hw.optional.armv8_crc32" here:
	// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619
	int flag = 0;
	size_t flag_size = sizeof(flag);
	const int err = sysctlbyname("hw.optional.armv8_crc32", &flag,
			&flag_size, NULL, 0);

	// A failed query counts as "not supported".
	return err == 0 && flag != 0;

#else
	// Reaching this branch means no runtime detection method is
	// available, which has to be a compile time error. The checks in
	// crc_common.h should ensure that a method is always found when
	// this function is built. Returning false here would work too, but
	// it would waste binary size and runtime because only the generic
	// method could ever be used.
#	error Runtime detection method unavailable.
#endif
}
#endif

#endif // LZMA_CRC32_ARM64_H