1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
/*
* Use the fixed point version of Barrett reduction to compute a mod n
* over GF(2) for n = 0x104c11db7 using POWER8 instructions. We use k = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of either:
*
* a) the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version, or
* b) the Apache License, Version 2.0
*/
#if defined (__clang__)
#ifndef __ALTIVEC__
#define __ALTIVEC__
#endif
#include "ppc-asm.h"
#else
#include <ppc-asm.h>
#endif
#include "ppc-opcode.h"
.section .data
.balign 16
.constants:
/* Barrett constant m - (4^32)/n */
.octa 0x00000000000000000000000104d101df
/* Barrett constant n */
.octa 0x00000000000000000000000104c11db7
.bit_reflected_constants:
/* 33 bit reflected Barrett constant m - (4^32)/n */
.octa 0x000000000000000000000001f7011641
/* 33 bit reflected Barrett constant n */
.octa 0x000000000000000000000001db710641
.text
/* unsigned int barrett_reduction(unsigned long val) */
FUNC_START(barrett_reduction)
lis r4,.constants@ha
la r4,.constants@l(r4)
li r5,16
vxor v1,v1,v1 /* zero v1 */
/* Get a into v0 */
MTVRD(v0, r3)
vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */
/* Load constants */
lvx v2,0,r4 /* m */
lvx v3,r5,r4 /* n */
/*
* Now for the actual algorithm. The idea is to calculate q,
* the multiple of our polynomial that we need to subtract. By
* doing the computation 2x bits higher (ie 64 bits) and shifting the
* result back down 2x bits, we round down to the nearest multiple.
*/
VPMSUMD(v4,v0,v2) /* ma */
vsldoi v4,v1,v4,8 /* q = floor(ma/(2^64)) */
VPMSUMD(v4,v4,v3) /* qn */
vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */
/*
* Get the result into r3. We need to shift it left 8 bytes:
* V0 [ 0 1 2 X ]
* V0 [ 0 X 2 3 ]
*/
vsldoi v0,v0,v1,8 /* shift result into top 64 bits of v0 */
MFVRD(r3, v0)
blr
FUNC_END(barrett_reduction)
/* unsigned int barrett_reduction_reflected(unsigned long val) */
FUNC_START(barrett_reduction_reflected)
lis r4,.bit_reflected_constants@ha
la r4,.bit_reflected_constants@l(r4)
li r5,16
vxor v1,v1,v1 /* zero v1 */
/* Get a into v0 */
MTVRD(v0, r3)
vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */
/* Load constants */
lvx v2,0,r4 /* m */
lvx v3,r5,r4 /* n */
vspltisw v5,-1 /* all ones */
vsldoi v6,v1,v5,4 /* bitmask with low 32 bits set */
/*
* Now for the Barrett reduction algorithm. Instead of bit reflecting
* our data (which is expensive to do), we bit reflect our constants
* and our algorithm, which means the intermediate data in our vector
* registers goes from 0-63 instead of 63-0. We can reflect the
* algorithm because we don't carry in mod 2 arithmetic.
*/
vand v4,v0,v6 /* bottom 32 bits of a */
VPMSUMD(v4,v4,v2) /* ma */
vand v4,v4,v6 /* bottom 32bits of ma */
VPMSUMD(v4,v4,v3) /* qn */
vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in the
* high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
vsldoi v0,v0,v1,4 /* shift result into top 64 bits of v0 */
MFVRD(r3, v0)
blr
FUNC_END(barrett_reduction_reflected)
|