mysys/crc32/crc32_x86.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334

/* Copyright (c) 2020, 2021, MariaDB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */

/*
  Implementation of CRC32 (Ethernet) uing Intel PCLMULQDQ
  Ported from Intels work, see https://github.com/intel/soft-crc
*/

/*******************************************************************************
 Copyright (c) 2009-2018, Intel Corporation

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of Intel Corporation nor the names of its contributors
       may be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/


#include <my_global.h>
#include <my_compiler.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>

#ifdef __GNUC__
#include <x86intrin.h>
#elif defined(_MSC_VER)
#include <intrin.h>
#else
#error "unknown compiler"
#endif

/**
 * @brief Shifts left 128 bit register by specified number of bytes
 *
 * @param reg 128 bit value
 * @param num number of bytes to shift left \a reg by (0-16)
 *
 * @return \a reg << (\a num * 8)
 */
static inline __m128i xmm_shift_left(__m128i reg, const unsigned int num)
{
  static const MY_ALIGNED(16) uint8_t crc_xmm_shift_tab[48]= {
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};

  const __m128i *p= (const __m128i *) (crc_xmm_shift_tab + 16 - num);

  return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}

struct crcr_pclmulqdq_ctx
{
  uint64_t rk1;
  uint64_t rk2;
  uint64_t rk5;
  uint64_t rk6;
  uint64_t rk7;
  uint64_t rk8;
};

/**
 * @brief Performs one folding round
 *
 * Logically function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block 16 byte data block
 * @param precomp precomputed rk1 constanst
 * @param fold running 16 byte folded data
 *
 * @return New 16 byte folded data
 */
static inline __m128i crcr32_folding_round(const __m128i data_block,
                                    const __m128i precomp, const __m128i fold)
{
  __m128i tmp0= _mm_clmulepi64_si128(fold, precomp, 0x01);
  __m128i tmp1= _mm_clmulepi64_si128(fold, precomp, 0x10);

  return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}

/**
 * @brief Performs reduction from 128 bits to 64 bits
 *
 * @param data128 128 bits data to be reduced
 * @param precomp rk5 and rk6 precomputed constants
 *
 * @return data reduced to 64 bits
 */
static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i precomp)
{
  __m128i tmp0, tmp1, tmp2;

  /* 64b fold */
  tmp0= _mm_clmulepi64_si128(data128, precomp, 0x00);
  tmp1= _mm_srli_si128(data128, 8);
  tmp0= _mm_xor_si128(tmp0, tmp1);

  /* 32b fold */
  tmp2= _mm_slli_si128(tmp0, 4);
  tmp1= _mm_clmulepi64_si128(tmp2, precomp, 0x10);

  return _mm_xor_si128(tmp1, tmp0);
}

/**
 * @brief Performs Barret's reduction from 64 bits to 32 bits
 *
 * @param data64 64 bits data to be reduced
 * @param precomp rk7 precomputed constant
 *
 * @return data reduced to 32 bits
 */
static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i precomp)
{
  static const MY_ALIGNED(16) uint32_t mask1[4]= {
      0xffffffff, 0xffffffff, 0x00000000, 0x00000000};
  static const MY_ALIGNED(16) uint32_t mask2[4]= {
      0x00000000, 0xffffffff, 0xffffffff, 0xffffffff};
  __m128i tmp0, tmp1, tmp2;

  tmp0= _mm_and_si128(data64, _mm_load_si128((__m128i *) mask2));

  tmp1= _mm_clmulepi64_si128(tmp0, precomp, 0x00);
  tmp1= _mm_xor_si128(tmp1, tmp0);
  tmp1= _mm_and_si128(tmp1, _mm_load_si128((__m128i *) mask1));

  tmp2= _mm_clmulepi64_si128(tmp1, precomp, 0x10);
  tmp2= _mm_xor_si128(tmp2, tmp1);
  tmp2= _mm_xor_si128(tmp2, tmp0);

  return _mm_extract_epi32(tmp2, 2);
}

/**
 * @brief Calculates reflected 32-bit CRC for given \a data block
 *        by applying folding and reduction methods.
 *
 * Algorithm operates on 32 bit CRCs.
 * Polynomials and initial values may need to be promoted to
 * 32 bits where required.
 *
 * @param crc initial CRC value (32 bit value)
 * @param data pointer to data block
 * @param data_len length of \a data block in bytes
 * @param params pointer to PCLMULQDQ CRC calculation context
 *
 * @return CRC for given \a data block (32 bits wide).
 */
static inline uint32_t crcr32_calc_pclmulqdq(const uint8_t *data, uint32_t data_len,
                                      uint32_t crc,
                                      const struct crcr_pclmulqdq_ctx *params)
{
  __m128i temp, fold, k;
  uint32_t n;

  DBUG_ASSERT(data != NULL || data_len == 0);
  DBUG_ASSERT(params);

  if (unlikely(data_len == 0))
    return crc;

  /**
   * Get CRC init value
   */
  temp= _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

  /**
   * -------------------------------------------------
   * Folding all data into single 16 byte data block
   * Assumes: \a fold holds first 16 bytes of data
   */

  if (unlikely(data_len < 32))
  {
    if (unlikely(data_len == 16))
    {
      /* 16 bytes */
      fold= _mm_loadu_si128((__m128i *) data);
      fold= _mm_xor_si128(fold, temp);
      goto reduction_128_64;
    }
    if (unlikely(data_len < 16))
    {
      /* 0 to 15 bytes */
      MY_ALIGNED(16) uint8_t buffer[16];

      memset(buffer, 0, sizeof(buffer));
      memcpy(buffer, data, data_len);

      fold= _mm_load_si128((__m128i *) buffer);
      fold= _mm_xor_si128(fold, temp);
      if ((data_len < 4))
      {
        fold= xmm_shift_left(fold, 8 - data_len);
        goto barret_reduction;
      }
      fold= xmm_shift_left(fold, 16 - data_len);
      goto reduction_128_64;
    }
    /* 17 to 31 bytes */
    fold= _mm_loadu_si128((__m128i *) data);
    fold= _mm_xor_si128(fold, temp);
    n= 16;
    k= _mm_load_si128((__m128i *) (&params->rk1));
    goto partial_bytes;
  }

  /**
   * At least 32 bytes in the buffer
   */

  /**
   * Apply CRC initial value
   */
  fold= _mm_loadu_si128((const __m128i *) data);
  fold= _mm_xor_si128(fold, temp);

  /**
   * Main folding loop
   * - the last 16 bytes is processed separately
   */
  k= _mm_load_si128((__m128i *) (&params->rk1));
  for (n= 16; (n + 16) <= data_len; n+= 16)
  {
    temp= _mm_loadu_si128((__m128i *) &data[n]);
    fold= crcr32_folding_round(temp, k, fold);
  }

partial_bytes:
  if (likely(n < data_len))
  {
    static const MY_ALIGNED(16) uint32_t mask3[4]= {0x80808080, 0x80808080,
                                                   0x80808080, 0x80808080};
    static const MY_ALIGNED(16) uint8_t shf_table[32]= {
        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
        0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
        0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    __m128i last16, a, b;

    last16= _mm_loadu_si128((const __m128i *) &data[data_len - 16]);

    temp= _mm_loadu_si128((const __m128i *) &shf_table[data_len & 15]);
    a= _mm_shuffle_epi8(fold, temp);

    temp= _mm_xor_si128(temp, _mm_load_si128((const __m128i *) mask3));
    b= _mm_shuffle_epi8(fold, temp);
    b= _mm_blendv_epi8(b, last16, temp);

    /* k = rk1 & rk2 */
    temp= _mm_clmulepi64_si128(a, k, 0x01);
    fold= _mm_clmulepi64_si128(a, k, 0x10);

    fold= _mm_xor_si128(fold, temp);
    fold= _mm_xor_si128(fold, b);
  }

  /**
   * -------------------------------------------------
   * Reduction 128 -> 32
   * Assumes: \a fold holds 128bit folded data
   */
reduction_128_64:
  k= _mm_load_si128((__m128i *) (&params->rk5));
  fold= crcr32_reduce_128_to_64(fold, k);

barret_reduction:
  k= _mm_load_si128((__m128i *) (&params->rk7));
  n= crcr32_reduce_64_to_32(fold, k);
  return n;
}

static const MY_ALIGNED(16) struct crcr_pclmulqdq_ctx ether_crc32_clmul= {
    0xccaa009e,  /**< rk1 */
    0x1751997d0, /**< rk2 */
    0xccaa009e,  /**< rk5 */
    0x163cd6124, /**< rk6 */
    0x1f7011640, /**< rk7 */
    0x1db710641  /**< rk8 */
};

/**
 * @brief Calculates Ethernet CRC32 using PCLMULQDQ method.
 *
 * @param data pointer to data block to calculate CRC for
 * @param data_len size of data block
 *
 * @return New CRC value
 */
unsigned int crc32_pclmul(unsigned int crc32, const void *buf, size_t len)
{
  return ~crcr32_calc_pclmulqdq(buf, (uint32_t)len, ~crc32, &ether_crc32_clmul);
}