src/liblzma/check/crc_x86_clmul.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428

// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       crc_x86_clmul.h
/// \brief      CRC32 and CRC64 implementations using CLMUL instructions.
///
/// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and
/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too.
///
/// They were derived from
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
/// and the public domain code from https://github.com/rawrunprotected/crc
/// (URLs were checked on 2023-10-14).
///
/// While this file has both CRC32 and CRC64 implementations, only one
/// should be built at a time to ensure that crc_simd_body() is inlined
/// even with compilers with which lzma_always_inline expands to plain inline.
/// The version to build is selected by defining BUILDING_CRC32_CLMUL or
/// BUILDING_CRC64_CLMUL before including this file.
///
/// FIXME: Builds for 32-bit x86 use the assembly .S files by default
/// unless configured with --disable-assembler. Even then the lookup table
/// isn't omitted in crc64_table.c since it doesn't know that assembly
/// code has been disabled.
//
//  Authors:    Ilya Kurdyukov
//              Hans Jansen
//              Lasse Collin
//              Jia Tan
//
///////////////////////////////////////////////////////////////////////////////

// This file must not be included more than once.
#ifdef LZMA_CRC_X86_CLMUL_H
#	error crc_x86_clmul.h was included twice.
#endif
#define LZMA_CRC_X86_CLMUL_H

#include <immintrin.h>

#if defined(_MSC_VER)
#	include <intrin.h>
#elif defined(HAVE_CPUID_H)
#	include <cpuid.h>
#endif


// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
// define __GNUC__ but the attribute must not be used with them.
// The new Clang-based ICX needs the attribute.
//
// NOTE: Build systems check for this too, keep them in sync with this.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
#	define crc_attr_target \
		__attribute__((__target__("ssse3,sse4.1,pclmul")))
#else
#	define crc_attr_target
#endif


#define MASK_L(in, mask, r) r = _mm_shuffle_epi8(in, mask)

#define MASK_H(in, mask, r) \
	r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign))

#define MASK_LH(in, mask, low, high) \
	MASK_L(in, mask, low); \
	MASK_H(in, mask, high)


crc_attr_target
crc_attr_no_sanitize_address
static lzma_always_inline void
crc_simd_body(const uint8_t *buf, const size_t size, __m128i *v0, __m128i *v1,
		const __m128i vfold16, const __m128i initial_crc)
{
	// Create a vector with 8-bit values 0 to 15. This is used to
	// construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8.
	const __m128i vramp = _mm_setr_epi32(
			0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);

	// This is used to inverse the control mask of _mm_shuffle_epi8
	// so that bytes that wouldn't be picked with the original mask
	// will be picked and vice versa.
	const __m128i vsign = _mm_set1_epi8(-0x80);

	// Memory addresses A to D and the distances between them:
	//
	//     A           B     C         D
	//     [skip_start][size][skip_end]
	//     [     size2      ]
	//
	// A and D are 16-byte aligned. B and C are 1-byte aligned.
	// skip_start and skip_end are 0-15 bytes. size is at least 1 byte.
	//
	// A = aligned_buf will initially point to this address.
	// B = The address pointed by the caller-supplied buf.
	// C = buf + size == aligned_buf + size2
	// D = buf + size + skip_end == aligned_buf + size2 + skip_end
	const size_t skip_start = (size_t)((uintptr_t)buf & 15);
	const size_t skip_end = (size_t)((0U - (uintptr_t)(buf + size)) & 15);
	const __m128i *aligned_buf = (const __m128i *)(
			(uintptr_t)buf & ~(uintptr_t)15);

	// If size2 <= 16 then the whole input fits into a single 16-byte
	// vector. If size2 > 16 then at least two 16-byte vectors must
	// be processed. If size2 > 16 && size <= 16 then there is only
	// one 16-byte vector's worth of input but it is unaligned in memory.
	//
	// NOTE: There is no integer overflow here if the arguments
	// are valid. If this overflowed, buf + size would too.
	const size_t size2 = skip_start + size;

	// Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:
	// The first skip_start or skip_end bytes in the vectors will have
	// the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8
	// will produce zeros for these positions. (Bitwise-xor of these
	// masks with vsign will produce the opposite behavior.)
	const __m128i mask_start
			= _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_start));
	const __m128i mask_end
			= _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_end));

	// Get the first 1-16 bytes into data0. If loading less than 16
	// bytes, the bytes are loaded to the high bits of the vector and
	// the least significant positions are filled with zeros.
	const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf),
			_mm_setzero_si128(), mask_start);
	aligned_buf++;

	__m128i v2, v3;

#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
	if (size <= 16) {
		// Right-shift initial_crc by 1-16 bytes based on "size"
		// and store the result in v1 (high bytes) and v0 (low bytes).
		//
		// NOTE: The highest 8 bytes of initial_crc are zeros so
		// v1 will be filled with zeros if size >= 8. The highest
		// 8 bytes of v1 will always become zeros.
		//
		// [      v1      ][      v0      ]
		//  [ initial_crc  ]                  size == 1
		//   [ initial_crc  ]                 size == 2
		//                [ initial_crc  ]    size == 15
		//                 [ initial_crc  ]   size == 16 (all in v0)
		const __m128i mask_low = _mm_add_epi8(
				vramp, _mm_set1_epi8((char)(size - 16)));
		MASK_LH(initial_crc, mask_low, *v0, *v1);

		if (size2 <= 16) {
			// There are 1-16 bytes of input and it is all
			// in data0. Copy the input bytes to v3. If there
			// are fewer than 16 bytes, the low bytes in v3
			// will be filled with zeros. That is, the input
			// bytes are stored to the same position as
			// (part of) initial_crc is in v0.
			MASK_L(data0, mask_end, v3);
		} else {
			// There are 2-16 bytes of input but not all bytes
			// are in data0.
			const __m128i data1 = _mm_load_si128(aligned_buf);

			// Collect the 2-16 input bytes from data0 and data1
			// to v2 and v3, and bitwise-xor them with the
			// low bits of initial_crc in v0. Note that the
			// the second xor is below this else-block as it
			// is shared with the other branch.
			MASK_H(data0, mask_end, v2);
			MASK_L(data1, mask_end, v3);
			*v0 = _mm_xor_si128(*v0, v2);
		}

		*v0 = _mm_xor_si128(*v0, v3);
		*v1 = _mm_alignr_epi8(*v1, *v0, 8);
	} else
#endif
	{
		// There is more than 16 bytes of input.
		const __m128i data1 = _mm_load_si128(aligned_buf);
		const __m128i *end = (const __m128i*)(
				(const char *)aligned_buf - 16 + size2);
		aligned_buf++;

		MASK_LH(initial_crc, mask_start, *v0, *v1);
		*v0 = _mm_xor_si128(*v0, data0);
		*v1 = _mm_xor_si128(*v1, data1);

		while (aligned_buf < end) {
			*v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
					*v0, vfold16, 0x00));
			*v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
					*v0, vfold16, 0x11));
			*v1 = _mm_load_si128(aligned_buf++);
		}

		if (aligned_buf != end) {
			MASK_H(*v0, mask_end, v2);
			MASK_L(*v0, mask_end, *v0);
			MASK_L(*v1, mask_end, v3);
			*v1 = _mm_or_si128(v2, v3);
		}

		*v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
				*v0, vfold16, 0x00));
		*v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
				*v0, vfold16, 0x11));
		*v1 = _mm_srli_si128(*v0, 8);
	}
}


/////////////////////
// x86 CLMUL CRC32 //
/////////////////////

/*
// These functions were used to generate the constants
// at the top of crc32_arch_optimized().
static uint64_t
calc_lo(uint64_t p, uint64_t a, int n)
{
	uint64_t b = 0; int i;
	for (i = 0; i < n; i++) {
		b = b >> 1 | (a & 1) << (n - 1);
		a = (a >> 1) ^ ((0 - (a & 1)) & p);
	}
	return b;
}

// same as ~crc(&a, sizeof(a), ~0)
static uint64_t
calc_hi(uint64_t p, uint64_t a, int n)
{
	int i;
	for (i = 0; i < n; i++)
		a = (a >> 1) ^ ((0 - (a & 1)) & p);
	return a;
}
*/

#ifdef BUILDING_CRC32_CLMUL

crc_attr_target
crc_attr_no_sanitize_address
static uint32_t
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
{
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
	// The code assumes that there is at least one byte of input.
	if (size == 0)
		return crc;
#endif

	// uint32_t poly = 0xedb88320;
	const int64_t p = 0x1db710640; // p << 1
	const int64_t mu = 0x1f7011641; // calc_lo(p, p, 32) << 1 | 1
	const int64_t k5 = 0x163cd6124; // calc_hi(p, p, 32) << 1
	const int64_t k4 = 0x0ccaa009e; // calc_hi(p, p, 64) << 1
	const int64_t k3 = 0x1751997d0; // calc_hi(p, p, 128) << 1

	const __m128i vfold4 = _mm_set_epi64x(mu, p);
	const __m128i vfold8 = _mm_set_epi64x(0, k5);
	const __m128i vfold16 = _mm_set_epi64x(k4, k3);

	__m128i v0, v1, v2;

	crc_simd_body(buf,  size, &v0, &v1, vfold16,
			_mm_cvtsi32_si128((int32_t)~crc));

	v1 = _mm_xor_si128(
			_mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0
	v2 = _mm_shuffle_epi32(v1, 0xe7); // 0xx0
	v0 = _mm_slli_epi64(v1, 32);  // [0]
	v0 = _mm_clmulepi64_si128(v0, vfold8, 0x00);
	v0 = _mm_xor_si128(v0, v2);   // [1] [2]
	v2 = _mm_clmulepi64_si128(v0, vfold4, 0x10);
	v2 = _mm_clmulepi64_si128(v2, vfold4, 0x00);
	v0 = _mm_xor_si128(v0, v2);   // [2]
	return ~(uint32_t)_mm_extract_epi32(v0, 2);
}
#endif // BUILDING_CRC32_CLMUL


/////////////////////
// x86 CLMUL CRC64 //
/////////////////////

/*
// These functions were used to generate the constants
// at the top of crc64_arch_optimized().
static uint64_t
calc_lo(uint64_t poly)
{
	uint64_t a = poly;
	uint64_t b = 0;

	for (unsigned i = 0; i < 64; ++i) {
		b = (b >> 1) | (a << 63);
		a = (a >> 1) ^ (a & 1 ? poly : 0);
	}

	return b;
}

static uint64_t
calc_hi(uint64_t poly, uint64_t a)
{
	for (unsigned i = 0; i < 64; ++i)
		a = (a >> 1) ^ (a & 1 ? poly : 0);

	return a;
}
*/

#ifdef BUILDING_CRC64_CLMUL

// MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC
// code when optimizations are enabled (release build). According to the bug
// report, the ebx register is corrupted and the calculated result is wrong.
// Trying to workaround the problem with "__asm mov ebx, ebx" didn't help.
// The following pragma works and performance is still good. x86-64 builds
// and CRC32 CLMUL aren't affected by this problem. The problem does not
// happen in crc_simd_body() either (which is shared with CRC32 CLMUL anyway).
//
// NOTE: Another pragma after crc64_arch_optimized() restores
// the optimizations. If the #if condition here is updated,
// the other one must be updated too.
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
		&& defined(_M_IX86)
#	pragma optimize("g", off)
#endif

crc_attr_target
crc_attr_no_sanitize_address
static uint64_t
crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
{
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
	// The code assumes that there is at least one byte of input.
	if (size == 0)
		return crc;
#endif

	// const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial
	const uint64_t p  = 0x92d8af2baf0e1e85; // (poly << 1) | 1
	const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1
	const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1)
	const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2)

	const __m128i vfold8 = _mm_set_epi64x((int64_t)p, (int64_t)mu);
	const __m128i vfold16 = _mm_set_epi64x((int64_t)k2, (int64_t)k1);

	__m128i v0, v1, v2;

#if defined(__i386__) || defined(_M_IX86)
	crc_simd_body(buf,  size, &v0, &v1, vfold16,
			_mm_set_epi64x(0, (int64_t)~crc));
#else
	// GCC and Clang would produce good code with _mm_set_epi64x
	// but MSVC needs _mm_cvtsi64_si128 on x86-64.
	crc_simd_body(buf,  size, &v0, &v1, vfold16,
			_mm_cvtsi64_si128((int64_t)~crc));
#endif

	v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold16, 0x10), v1);
	v0 = _mm_clmulepi64_si128(v1, vfold8, 0x00);
	v2 = _mm_clmulepi64_si128(v0, vfold8, 0x10);
	v0 = _mm_xor_si128(_mm_xor_si128(v1, _mm_slli_si128(v0, 8)), v2);

#if defined(__i386__) || defined(_M_IX86)
	return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
			(uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
#else
	return ~(uint64_t)_mm_extract_epi64(v0, 1);
#endif
}

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
		&& defined(_M_IX86)
#	pragma optimize("", on)
#endif

#endif // BUILDING_CRC64_CLMUL


// Inlining this function duplicates the function body in crc32_resolve() and
// crc64_resolve(), but this is acceptable because this is a tiny function.
static inline bool
is_arch_extension_supported(void)
{
	int success = 1;
	uint32_t r[4]; // eax, ebx, ecx, edx

#if defined(_MSC_VER)
	// This needs <intrin.h> with MSVC. ICC has it as a built-in
	// on all platforms.
	__cpuid(r, 1);
#elif defined(HAVE_CPUID_H)
	// Compared to just using __asm__ to run CPUID, this also checks
	// that CPUID is supported and saves and restores ebx as that is
	// needed with GCC < 5 with position-independent code (PIC).
	success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
#else
	// Just a fallback that shouldn't be needed.
	__asm__("cpuid\n\t"
			: "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
			: "a"(1), "c"(0));
#endif

	// Returns true if these are supported:
	// CLMUL (bit 1 in ecx)
	// SSSE3 (bit 9 in ecx)
	// SSE4.1 (bit 19 in ecx)
	const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
	return success && (r[2] & ecx_mask) == ecx_mask;

	// Alternative methods that weren't used:
	//   - ICC's _may_i_use_cpu_feature: the other methods should work too.
	//   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
	//
	// CPUID decding is needed with MSVC anyway and older GCC. This keeps
	// the feature checks in the build system simpler too. The nice thing
	// about __builtin_cpu_supports would be that it generates very short
	// code as is it only reads a variable set at startup but a few bytes
	// doesn't matter here.
}