// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum.  The original
// is dual licensed under GPL and Apache 2.  As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.

#include "textflag.h"

#define POWER8_OFFSET 132

#define off16	R16
#define off32	R17
#define off48	R18
#define off64	R19
#define off80	R20
#define off96	R21
#define	off112	R22

#define const1	V24
#define const2	V25

#define byteswap	V26
#define mask_32bit	V27
#define mask_64bit	V28
#define zeroes		V29

#define MAX_SIZE	32*1024	// max bytes folded in one outer pass
#define REFLECT			// the CRC polynomials used here are bit-reflected

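// ppc64SlicingUpdateBy8 updates crc using the slicing-by-8 algorithm:
// each 8 bytes of input are consumed with eight table lookups, one per
// 256-entry slice of the table, xor-ed together. Roughly equivalent Go,
// as a sketch (tab stands for the table8 argument, an 8x256 table of
// uint32; the assembly below also special-cases inputs shorter than 16
// bytes):
//
//	crc = ^crc
//	for ; len(p) >= 8; p = p[8:] {
//		crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//		crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
//			tab[4][crc>>24] ^ tab[5][crc>>16&0xFF] ^
//			tab[6][crc>>8&0xFF] ^ tab[7][crc&0xFF]
//	}
//	for _, v := range p {
//		crc = tab[0][byte(crc)^v] ^ crc>>8
//	}
//	return ^crc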
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVD    table8+8(FP), R4   // *Table
	MOVD    p+16(FP), R5
	MOVD    p_len+24(FP), R6 // p len

	CMP     $0,R6           // len == 0?
	BNE     start
	MOVW    R3,ret+40(FP)   // return crc
	RET

start:
	NOR     R3,R3,R7        // ^crc
	MOVWZ	R7,R7		// 32 bits
	CMP	R6,$16
	MOVD	R6,CTR
	BLT	short
	SRAD    $3,R6,R8        // 8 byte chunks
	MOVD    R8,CTR

loop:
	MOVWZ	0(R5),R8	// bytes 0-3 of p (little endian)
	MOVWZ	4(R5),R9	// 4-7 bytes of p
	MOVD	R4,R10		// &tab[0]
	XOR	R7,R8,R7	// crc ^= byte[0:3]
	RLDICL	$40,R9,$56,R17	// p[7]
	SLD	$2,R17,R17	// p[7]*4
	RLDICL	$40,R7,$56,R8	// crc>>24
	ADD	R17,R10,R17	// &tab[0][p[7]]
	SLD	$2,R8,R8	// crc>>24*4
	RLDICL	$48,R9,$56,R18	// p[6]
	SLD	$2,R18,R18	// p[6]*4
	ADD	$1024,R10,R10	// tab[1]
	MOVWZ	0(R17),R21	// tab[0][p[7]]
	RLDICL	$56,R9,$56,R19	// p[5]
	ADD	R10,R18,R18	// &tab[1][p[6]]
	SLD	$2,R19,R19	// p[5]*4
	MOVWZ	0(R18),R22	// tab[1][p[6]]
	ADD	$1024,R10,R10	// tab[2]
	XOR	R21,R22,R21	// xor done R22
	ADD	R19,R10,R19	// &tab[2][p[5]]
	ANDCC	$255,R9,R20	// p[4]
	SLD	$2,R20,R20	// p[4]*4
	MOVWZ	0(R19),R23	// tab[2][p[5]]
	ADD	$1024,R10,R10	// &tab[3]
	ADD	R20,R10,R20	// &tab[3][p[4]]
	XOR	R21,R23,R21	// xor done R23
	ADD	$1024,R10,R10	// &tab[4]
	MOVWZ	0(R20),R24	// tab[3][p[4]]
	ADD	R10,R8,R23	// &tab[4][crc>>24]
	XOR	R21,R24,R21	// xor done R24
	MOVWZ	0(R23),R25	// tab[4][crc>>24]
	RLDICL	$48,R7,$56,R24	// crc>>16&0xFF
	XOR	R21,R25,R21	// xor done R25
	ADD	$1024,R10,R10	// &tab[5]
	SLD	$2,R24,R24	// crc>>16&0xFF*4
	ADD	R24,R10,R24	// &tab[5][crc>>16&0xFF]
	MOVWZ	0(R24),R26	// tab[5][crc>>16&0xFF]
	XOR	R21,R26,R21	// xor done R26
	RLDICL	$56,R7,$56,R25	// crc>>8&0xFF
	ADD	$1024,R10,R10	// &tab[6]
	SLD	$2,R25,R25	// (crc>>8&0xFF)*4
	ADD	R25,R10,R25	// &tab[6][crc>>8&0xFF]
	MOVBZ   R7,R26          // crc&0xFF
	ADD     $1024,R10,R10   // &tab[7]
	MOVWZ	0(R25),R27	// tab[6][crc>>8&0xFF]
	SLD	$2,R26,R26	// (crc&0xFF)*4
	XOR	R21,R27,R21	// xor done R27
	ADD	R26,R10,R26	// &tab[7][crc&0xFF]
	ADD     $8,R5           // p = p[8:]
	MOVWZ	0(R26),R28	// tab[7][crc&0xFF]
	XOR	R21,R28,R21	// xor done R28
	MOVWZ	R21,R7		// crc for next round
	BC	16,0,loop	// bdnz: decrement CTR, loop to next 8 bytes while CTR != 0
	ANDCC	$7,R6,R8	// any leftover bytes
	BEQ	done		// none --> done
	MOVD	R8,CTR		// byte count
	PCALIGN $16             // align short loop
short:
	MOVBZ   0(R5),R8        // get v
	MOVBZ   R7,R9           // byte(crc) -> R9
	SRD     $8,R7,R14       // crc>>8
	XOR     R8,R9,R8        // byte(crc)^v -> R8
	ADD	$1,R5		// ptr to next v
	SLD     $2,R8           // index -> byte offset
	ADD     R8,R4,R9        // &tab[byte(crc)^v]
	MOVWZ   0(R9),R10       // tab[byte(crc)^v]
	XOR     R10,R14,R7       // loop crc in R7
	BC      16,0,short
done:
	NOR     R7,R7,R7        // ^crc
	MOVW    R7,ret+40(FP)   // return crc
	RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

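// vectorCrc32 computes a crc32 with the POWER8 vector polynomial
// multiply-sum instructions (VPMSUMD/VPMSUMW). Rough shape of the
// routine, as read from the code below:
//
//	- inputs under 256 bytes take the "short" path: each 16-byte
//	  chunk is multiplied by a per-offset constant with VPMSUMW and
//	  the products xor-ed, then Barrett-reduced to 32 bits.
//	- larger inputs run a pipelined main loop (warm up / cool_top /
//	  cool down) that folds 128 bytes per iteration into the eight
//	  accumulators V0-V7 with VPMSUMD, at most MAX_SIZE bytes per
//	  outer pass, then reduces the accumulators and the tail the
//	  same way.
//
// The ctab argument selects the constants (·IEEEConst or ·CastConst)
// matching the polynomial.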
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVWZ	ctab+4(FP), R14   // crc poly id
	MOVD    p+8(FP), R4
	MOVD    p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD	$16,off16
	MOVD	$32,off32
	MOVD	$48,off48
	MOVD	$64,off64
	MOVD	$80,off80
	MOVD	$96,off96
	MOVD	$112,off112
	MOVD	$0,R15

	MOVD	R3,R10	// save initial crc

	NOR	R3,R3,R3  // ^crc
	MOVWZ	R3,R3	// 32 bits
	VXOR	zeroes,zeroes,zeroes  // clear the V reg
	VSPLTISW $-1,V0		      // all one bits
	VSLDOI	$4,V29,V0,mask_32bit  // mask of the low 32 bits
	VSLDOI	$8,V29,V0,mask_64bit  // mask of the low 64 bits

	VXOR	V8,V8,V8
	MTVSRD	R3,VS40	// crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI	$8,zeroes,V8,V8  // shift the crc into the low doubleword
#else
	VSLDOI	$4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD    $·byteswapcons(SB),R3
	LVX	(R3),byteswap
#endif

	CMPU	R5,$256		// fewer than 256 bytes?
	BLT	short

	RLDICR	$0,R5,$56,R6 // chunk to process

	// First step for larger sizes
l1:	MOVD	$32768,R7	// MAX_SIZE
	MOVD	R7,R9
	CMP	R6,R7		// remaining length vs MAX_SIZE
	BGT	top		// more than MAX_SIZE: fold a full chunk
	MOVD	R6,R7		// no more than MAX_SIZE: fold what remains
top:
	SUB	R7,R6,R6

	// mainloop does 128 bytes at a time
	SRD	$7,R7

	// Determine the offset into the constants table to start with.
	// Each 16-byte constant is used against 128 bytes of data, so
	// fewer iterations start further into the table.
	SLD	$4,R7,R8
	SRD	$3,R9,R9
	SUB	R8,R9,R8

	// The last iteration is reduced in a separate step
	ADD	$-1,R7
	MOVD	R7,CTR

	// Determine which constant table (depends on poly)
	CMP	R14,$1
	BNE	castTable
	MOVD	$·IEEEConst(SB),R3
	BR	startConst
castTable:
	MOVD	$·CastConst(SB),R3

startConst:
	ADD	R3,R8,R3	// starting point in constants table

	VXOR	V0,V0,V0	// clear the V regs
	VXOR	V1,V1,V1
	VXOR	V2,V2,V2
	VXOR	V3,V3,V3
	VXOR	V4,V4,V4
	VXOR	V5,V5,V5
	VXOR	V6,V6,V6
	VXOR	V7,V7,V7

	LVX	(R3),const1	// loading constant values

	CMP	R15,$1		// already warmed up? (R15 set at end of a pass)
	BEQ	next

	// First warm up pass: load the bytes to process
	LVX	(R4),V16
	LVX	(R4+off16),V17
	LVX	(R4+off32),V18
	LVX	(R4+off48),V19
	LVX	(R4+off64),V20
	LVX	(R4+off80),V21
	LVX	(R4+off96),V22
	LVX	(R4+off112),V23
	ADD	$128,R4		// bump up to next 128 bytes in buffer

	VXOR	V16,V8,V16	// xor in initial CRC in V8

next:
	BC	18,0,first_warm_up_done	// bdz: decrement CTR, branch if 0

	ADD	$16,R3		// bump up to next constants
	LVX	(R3),const2	// table values

	VPMSUMD	V16,const1,V8 // second warm up pass
	LVX	(R4),V16	// load from buffer
	OR	$0,R2,R2	// no-op: pads the dispatch group (as in the original)

	VPMSUMD	V17,const1,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V18,const1,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V19,const1,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// load next from buffer

	ADD	$128,R4		// bump up to next 128 bytes in buffer

	BC	18,0,first_cool_down

cool_top:
	LVX	(R3),const1	// constants
	ADD	$16,R3		// inc to next constants
	OR	$0,R2,R2

	VXOR	V0,V8,V0	// xor in previous vpmsumd
	VPMSUMD	V16,const2,V8	// vpmsumd with constants
	LVX	(R4),V16	// buffer
	OR	$0,R2,R2

	VXOR	V1,V9,V1	// xor in previous
	VPMSUMD	V17,const2,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// next in buffer
	OR	$0,R2,R2

	VXOR	V2,V10,V2	// xor in previous
	VPMSUMD	V18,const2,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// next in buffer
	OR	$0,R2,R2

	VXOR	V3,V11,V3	// xor in previous
	VPMSUMD	V19,const2,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// next in buffer
	LVX	(R3),const2	// get next constant
	OR	$0,R2,R2

	VXOR	V4,V12,V4	// xor in previous
	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// next in buffer
	OR	$0,R2,R2

	VXOR	V5,V13,V5	// xor in previous
	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// next in buffer
	OR	$0,R2,R2

	VXOR	V6,V14,V6	// xor in previous
	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// next in buffer
	OR	$0,R2,R2

	VXOR	V7,V15,V7	// xor in previous
	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// next in buffer

	ADD	$128,R4		// bump up buffer pointer
	BC	16,0,cool_top	// bdnz: loop until CTR is 0

first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX	(R3),const1
	ADD	$16,R3

	VXOR	V0,V8,V0
	VPMSUMD V16,const1,V8
	OR	$0,R2,R2

	VXOR	V1,V9,V1
	VPMSUMD	V17,const1,V9
	OR	$0,R2,R2

	VXOR	V2,V10,V2
	VPMSUMD	V18,const1,V10
	OR	$0,R2,R2

	VXOR	V3,V11,V3
	VPMSUMD	V19,const1,V11
	OR	$0,R2,R2

	VXOR	V4,V12,V4
	VPMSUMD	V20,const1,V12
	OR	$0,R2,R2

	VXOR	V5,V13,V5
	VPMSUMD	V21,const1,V13
	OR	$0,R2,R2

	VXOR	V6,V14,V6
	VPMSUMD	V22,const1,V14
	OR	$0,R2,R2

	VXOR	V7,V15,V7
	VPMSUMD	V23,const1,V15
	OR	$0,R2,R2

second_cool_down:

	VXOR    V0,V8,V0
	VXOR    V1,V9,V1
	VXOR    V2,V10,V2
	VXOR    V3,V11,V3
	VXOR    V4,V12,V4
	VXOR    V5,V13,V5
	VXOR    V6,V14,V6
	VXOR    V7,V15,V7

#ifdef REFLECT
	VSLDOI  $4,V0,zeroes,V0
	VSLDOI  $4,V1,zeroes,V1
	VSLDOI  $4,V2,zeroes,V2
	VSLDOI  $4,V3,zeroes,V3
	VSLDOI  $4,V4,zeroes,V4
	VSLDOI  $4,V5,zeroes,V5
	VSLDOI  $4,V6,zeroes,V6
	VSLDOI  $4,V7,zeroes,V7
#endif

	LVX	(R4),V8
	LVX	(R4+off16),V9
	LVX	(R4+off32),V10
	LVX	(R4+off48),V11
	LVX	(R4+off64),V12
	LVX	(R4+off80),V13
	LVX	(R4+off96),V14
	LVX	(R4+off112),V15

	ADD	$128,R4

	VXOR	V0,V8,V16
	VXOR	V1,V9,V17
	VXOR	V2,V10,V18
	VXOR	V3,V11,V19
	VXOR	V4,V12,V20
	VXOR	V5,V13,V21
	VXOR	V6,V14,V22
	VXOR	V7,V15,V23

	MOVD    $1,R15		// warm up is done for later passes
	CMP     $0,R6		// more 128 byte blocks to fold?
	ADD     $128,R6		// count back the 128 bytes read ahead above (ADD leaves CR0 intact)

	BNE	l1
	ANDCC   $127,R5		// residual length (len mod 128)
	SUBC	R5,$128,R6	// R6 = 128 - residual
	ADD	R3,R6,R3	// start that far into the tail constants

	SRD	$4,R5,R7	// number of 16 byte chunks in the tail
	MOVD	R7,CTR
	LVX	(R3),V0
	LVX	(R3+off16),V1
	LVX	(R3+off32),V2
	LVX	(R3+off48),V3
	LVX	(R3+off64),V4
	LVX	(R3+off80),V5
	LVX	(R3+off96),V6
	LVX	(R3+off112),V7

	ADD	$128,R3

	VPMSUMW	V16,V0,V0
	VPMSUMW	V17,V1,V1
	VPMSUMW	V18,V2,V2
	VPMSUMW	V19,V3,V3
	VPMSUMW	V20,V4,V4
	VPMSUMW	V21,V5,V5
	VPMSUMW	V22,V6,V6
	VPMSUMW	V23,V7,V7

	// now reduce the tail
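	// Each leftover 16-byte chunk is multiplied by the constant
	// matching its distance from the end (VPMSUMW) and folded into
	// V0; the bdz branches below exit as soon as CTR runs out.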

	CMP	$0,R7		// any 16 byte chunks?
	BEQ	next1

	LVX	(R4),V16
	LVX	(R3),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off16),V16
	LVX	(R3+off16),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off32),V16
	LVX	(R3+off32),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off48),V16
	LVX	(R3+off48),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off64),V16
	LVX	(R3+off64),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off80),V16
	LVX	(R3+off80),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off96),V16
	LVX	(R3+off96),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0

next1:
	VXOR	V0,V1,V0
	VXOR	V2,V3,V2
	VXOR	V4,V5,V4
	VXOR	V6,V7,V6
	VXOR	V0,V2,V0
	VXOR	V4,V6,V4
	VXOR	V0,V4,V0

barrett_reduction:
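	// Reduce the 64 bits in V0 to a 32-bit crc using Barrett
	// reduction in carry-less arithmetic: with P the polynomial and
	// mu = floor(x^64 / P) held in the *BarConst tables,
	//
	//	q   = high64(V0 * mu)     // quotient estimate
	//	crc = V0 xor low32(q * P) // remainder
	//
	// The reflected variant masks to 32 bits between the two
	// multiplies instead of shifting.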

	CMP	R14,$1
	BNE	barcstTable
	MOVD	$·IEEEBarConst(SB),R3
	BR	startbarConst
barcstTable:
	MOVD    $·CastBarConst(SB),R3

startbarConst:
	LVX	(R3),const1
	LVX	(R3+off16),const2

	VSLDOI	$8,V0,V0,V1
	VXOR	V0,V1,V0

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL	V0,V1,V0
#endif

	VAND	V0,mask_64bit,V0

#ifndef	REFLECT

	VPMSUMD	V0,const1,V1
	VSLDOI	$8,zeroes,V1,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$8,V0,zeroes,V0

#else

	VAND	V0,mask_32bit,V1
	VPMSUMD	V1,const1,V1
	VAND	V1,mask_32bit,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI  $4,V0,zeroes,V0

#endif

	MFVSRD	VS32,R3 // VS32 = V0

	NOR	R3,R3,R3 // return ^crc
	MOVW	R3,ret+32(FP)
	RET

first_warm_up_done:
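	// Reached when CTR ran out right after the warm up: multiply the
	// warmed up data by the first constant and rejoin the pipeline at
	// second_cool_down.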

	LVX	(R3),const1
	ADD	$16,R3

	VPMSUMD	V16,const1,V8
	VPMSUMD	V17,const1,V9
	VPMSUMD	V18,const1,V10
	VPMSUMD	V19,const1,V11
	VPMSUMD	V20,const1,V12
	VPMSUMD	V21,const1,V13
	VPMSUMD	V22,const1,V14
	VPMSUMD	V23,const1,V15

	BR	second_cool_down

short:
	CMP	$0,R5
	BEQ	zero

	// compute short constants

	CMP     R14,$1
	BNE     castshTable
	MOVD    $·IEEEConst(SB),R3
	ADD	$4080,R3	// advance to the constants for short inputs
	BR      startshConst
castshTable:
	MOVD    $·CastConst(SB),R3
	ADD	$4080,R3

startshConst:
	SUBC	R5,$256,R6	// R6 = 256 - len
	ADD	R3,R6,R3

	// calculate where to start

	SRD	$4,R5,R7
	MOVD	R7,CTR

	VXOR	V19,V19,V19
	VXOR	V20,V20,V20

	LVX	(R4),V0
	LVX	(R3),V16
	VXOR	V0,V8,V0
	VPMSUMW	V0,V16,V0
	BC	18,0,v0

	LVX	(R4+off16),V1
	LVX	(R3+off16),V17
	VPMSUMW	V1,V17,V1
	BC	18,0,v1

	LVX	(R4+off32),V2
	LVX	(R3+off32),V16
	VPMSUMW	V2,V16,V2
	BC	18,0,v2

	LVX	(R4+off48),V3
	LVX	(R3+off48),V17
	VPMSUMW	V3,V17,V3
	BC	18,0,v3

	LVX	(R4+off64),V4
	LVX	(R3+off64),V16
	VPMSUMW	V4,V16,V4
	BC	18,0,v4

	LVX	(R4+off80),V5
	LVX	(R3+off80),V17
	VPMSUMW	V5,V17,V5
	BC	18,0,v5

	LVX	(R4+off96),V6
	LVX	(R3+off96),V16
	VPMSUMW	V6,V16,V6
	BC	18,0,v6

	LVX	(R4+off112),V7
	LVX	(R3+off112),V17
	VPMSUMW	V7,V17,V7
	BC	18,0,v7

	ADD	$128,R3
	ADD	$128,R4

	LVX	(R4),V8
	LVX	(R3),V16
	VPMSUMW	V8,V16,V8
	BC	18,0,v8

	LVX	(R4+off16),V9
	LVX	(R3+off16),V17
	VPMSUMW	V9,V17,V9
	BC	18,0,v9

	LVX	(R4+off32),V10
	LVX	(R3+off32),V16
	VPMSUMW	V10,V16,V10
	BC	18,0,v10

	LVX	(R4+off48),V11
	LVX	(R3+off48),V17
	VPMSUMW	V11,V17,V11
	BC	18,0,v11

	LVX	(R4+off64),V12
	LVX	(R3+off64),V16
	VPMSUMW	V12,V16,V12
	BC	18,0,v12

	LVX	(R4+off80),V13
	LVX	(R3+off80),V17
	VPMSUMW	V13,V17,V13
	BC	18,0,v13

	LVX	(R4+off96),V14
	LVX	(R3+off96),V16
	VPMSUMW	V14,V16,V14
	BC	18,0,v14

	LVX	(R4+off112),V15
	LVX	(R3+off112),V17
	VPMSUMW	V15,V17,V15

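	// Fall-through xor chain: the bdz branches above enter at the
	// label for the last chunk processed, so each partial product is
	// folded into V19/V20 exactly once.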
	VXOR	V19,V15,V19
v14:	VXOR	V20,V14,V20
v13:	VXOR	V19,V13,V19
v12:	VXOR	V20,V12,V20
v11:	VXOR	V19,V11,V19
v10:	VXOR	V20,V10,V20
v9:	VXOR	V19,V9,V19
v8:	VXOR	V20,V8,V20
v7:	VXOR	V19,V7,V19
v6:	VXOR	V20,V6,V20
v5:	VXOR	V19,V5,V19
v4:	VXOR	V20,V4,V20
v3:	VXOR	V19,V3,V19
v2:	VXOR	V20,V2,V20
v1:	VXOR	V19,V1,V19
v0:	VXOR	V20,V0,V20

	VXOR	V19,V20,V0

	BR	barrett_reduction

zero:
	// Nothing to process: return the original crc unchanged
	MOVW    R10,ret+32(FP)
	RET