// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       memcmplen.h
/// \brief      Optimized comparison of two buffers
//
//  Author:     Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////

#ifndef LZMA_MEMCMPLEN_H
#define LZMA_MEMCMPLEN_H

#include "common.h"

#ifdef HAVE_IMMINTRIN_H
#	include <immintrin.h>
#endif

// Only include <intrin.h> if it is needed. The header is only needed
// on Windows when using an MSVC compatible compiler. The Intel compiler
// can use the intrinsics without the header file.
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(_MSC_VER) \
		&& (defined(_M_X64) \
			|| defined(_M_ARM64) || defined(_M_ARM64EC)) \
		&& !defined(__INTEL_COMPILER)
#	include <intrin.h>
#endif


/// Find out how many equal bytes the two buffers have.
///
/// \param      buf1    First buffer
/// \param      buf2    Second buffer
/// \param      len     How many bytes have already been compared and will
///                     be assumed to match
/// \param      limit   How many bytes to compare at most, including the
///                     already-compared bytes. This must be significantly
///                     smaller than UINT32_MAX to avoid integer overflows.
///                     Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past
///                     the specified limit from both buf1 and buf2.
///
/// \return     Number of equal bytes in the buffers is returned.
///             This is always at least len and at most limit.
///
/// \note       LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read.
///             It's rounded up to 2^n. This extra amount needs to be
///             allocated in the buffers being used. It needs to be
///             initialized too to keep Valgrind quiet.
static lzma_always_inline uint32_t
lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2,
		uint32_t len, uint32_t limit)
{
	assert(len <= limit);
	assert(limit <= UINT32_MAX / 2);

#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& (((TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) \
				&& (defined(__x86_64__) \
					|| defined(__aarch64__))) \
			|| (defined(__INTEL_COMPILER) && defined(__x86_64__)) \
			|| (defined(__INTEL_COMPILER) && defined(_M_X64)) \
			|| (defined(_MSC_VER) && (defined(_M_X64) \
				|| defined(_M_ARM64) || defined(_M_ARM64EC))))
	// This is only for x86-64 and ARM64 for now. This might be fine on
	// other 64-bit processors too. On big endian one should use xor
	// instead of subtraction and switch to __builtin_clzll().
	//
	// Reasons to use subtraction instead of xor:
	//
	//   - On some x86-64 processors (Intel Sandy Bridge to Tiger Lake),
	//     sub+jz and sub+jnz can be fused but xor+jz or xor+jnz cannot.
	//     Thus using subtraction has potential to be a tiny amount faster
	//     since the code checks if the difference is non-zero.
	//
	//   - Some processors (Intel Pentium 4) used to have more ALU
	//     resources for add/sub instructions than and/or/xor.
	//
	// The processor info is based on Agner Fog's microarchitecture.pdf
	// version 2023-05-26. https://www.agner.org/optimize/
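	//
	// Illustrative note (added for clarity, not part of the hot path):
	// on little endian, byte i of each buffer lands in bits
	// [8*i, 8*i+7] of the 64-bit load. If the first j bytes match, the
	// low 8*j bits of the two words are equal, so the low 8*j bits of
	// the difference are zero and no borrow reaches byte j; byte j of
	// the difference is (buf1[j] - buf2[j]) mod 256, which is non-zero.
	// Therefore __builtin_ctzll(x) falls inside byte j and ">> 3"
	// recovers j. For example, buf1 = 11 22 33 44 ... and
	// buf2 = 11 22 35 44 ... give a difference whose lowest set bit is
	// in byte 2, so len is advanced by 2 before the my_min() clamp.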
#define LZMA_MEMCMPLEN_EXTRA 8
	while (len < limit) {
		const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len);
		if (x != 0) {
	// MSVC or Intel C compiler on Windows
#	if defined(_MSC_VER) || defined(__INTEL_COMPILER)
			unsigned long tmp;
			_BitScanForward64(&tmp, x);
			len += (uint32_t)tmp >> 3;
	// GCC, Clang, or Intel C compiler
#	else
			len += (uint32_t)__builtin_ctzll(x) >> 3;
#	endif
			return my_min(len, limit);
		}

		len += 8;
	}

	return limit;

#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(HAVE__MM_MOVEMASK_EPI8) \
		&& (defined(__SSE2__) \
			|| (defined(_MSC_VER) && defined(_M_IX86_FP) \
				&& _M_IX86_FP >= 2))
	// NOTE: This will use 128-bit unaligned access which
	// TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit,
	// but it's convenient here since this is x86-only.
	//
	// SSE2 version for 32-bit and 64-bit x86. On x86-64 the above
	// version is sometimes significantly faster and sometimes
	// slightly slower than this SSE2 version, so this SSE2
	// version isn't used on x86-64.
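	//
	// Descriptive note on how the loop below finds the mismatch:
	// _mm_cmpeq_epi8 yields 0xFF for every byte position where the two
	// 16-byte blocks agree and 0x00 where they differ.
	// _mm_movemask_epi8 packs the top bit of each byte into a 16-bit
	// mask, so equal bytes become 1-bits; XORing with 0xFFFF inverts
	// the mask so that set bits mark mismatches, and ctz32() then
	// gives the offset of the first mismatching byte within the block.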
#	define LZMA_MEMCMPLEN_EXTRA 16
	while (len < limit) {
		const uint32_t x = 0xFFFF ^ (uint32_t)_mm_movemask_epi8(
			_mm_cmpeq_epi8(
			_mm_loadu_si128((const __m128i *)(buf1 + len)),
			_mm_loadu_si128((const __m128i *)(buf2 + len))));

		if (x != 0) {
			len += ctz32(x);
			return my_min(len, limit);
		}

		len += 16;
	}

	return limit;

#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN)
	// Generic 32-bit little endian method
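	// This follows the same idea as the 64-bit difference above, just
	// narrowed by hand: if the low 16 bits of the difference are zero,
	// the first two bytes match; if the low byte of what remains is
	// also zero, one more byte matches, so len ends at the first
	// mismatching byte.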
#	define LZMA_MEMCMPLEN_EXTRA 4
	while (len < limit) {
		uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len);
		if (x != 0) {
			if ((x & 0xFFFF) == 0) {
				len += 2;
				x >>= 16;
			}

			if ((x & 0xFF) == 0)
				++len;

			return my_min(len, limit);
		}

		len += 4;
	}

	return limit;

#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN)
	// Generic 32-bit big endian method
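	// Same narrowing as the little endian case, but with XOR instead
	// of subtraction: on big endian the first bytes of the buffers end
	// up in the most significant bits of the 32-bit load, so the high
	// half and the high byte of the XOR result are inspected instead
	// of the low ones.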
#	define LZMA_MEMCMPLEN_EXTRA 4
	while (len < limit) {
		uint32_t x = read32ne(buf1 + len) ^ read32ne(buf2 + len);
		if (x != 0) {
			if ((x & 0xFFFF0000) == 0) {
				len += 2;
				x <<= 16;
			}

			if ((x & 0xFF000000) == 0)
				++len;

			return my_min(len, limit);
		}

		len += 4;
	}

	return limit;

#else
	// Simple portable version that doesn't use unaligned access.
#	define LZMA_MEMCMPLEN_EXTRA 0
	while (len < limit && buf1[len] == buf2[len])
		++len;

	return len;
#endif
}
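
// A minimal usage sketch (hypothetical caller, not part of liblzma). The
// buffers must be allocated with LZMA_MEMCMPLEN_EXTRA extra bytes beyond the
// highest offset that may be compared, and those bytes must be initialized:
//
//     uint8_t a[16 + LZMA_MEMCMPLEN_EXTRA] = "abcdefgh12345678";
//     uint8_t b[16 + LZMA_MEMCMPLEN_EXTRA] = "abcdefgh12345999";
//     // Bytes 0-3 are assumed to have been compared already (len = 4);
//     // compare at most 16 bytes in total.
//     uint32_t n = lzma_memcmplen(a, b, 4, 16);
//     // n == 13: bytes 0-12 match, byte 13 ('6' vs '9') differs.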

#endif