summaryrefslogtreecommitdiffstats
path: root/src/crypto/md5/md5block_arm.s
blob: 54d02b743a6a2b293b2e5de72d9cc0aedadc4291 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// ARM version of md5block.go

#include "textflag.h"

// Register definitions
//
// Symbolic names for the ARM registers used by the block routine in this file.
#define Rtable	R0	// Pointer to the next group of words in the MD5 constants table
#define Rdata	R1	// Pointer to the 64 bytes of data currently being hashed
#define Ra	R2	// MD5 accumulator a
#define Rb	R3	// MD5 accumulator b
#define Rc	R4	// MD5 accumulator c
#define Rd	R5	// MD5 accumulator d
#define Rc0	R6	// MD5 constant (1st word of the current group of four)
#define Rc1	R7	// MD5 constant (2nd word of the current group of four)
#define Rc2	R8	// MD5 constant (3rd word of the current group of four)
// r9, r10 are forbidden
// NOTE(review): presumably because the Go toolchain reserves them - confirm for the targeted release
// r11 is OK provided you check the assembler output to confirm that no synthetic instructions use it
#define Rc3	R11	// MD5 constant (4th word of the current group of four)
#define Rt0	R12	// temporary
#define Rt1	R14	// temporary; NOTE(review): R14 is the ARM link register - assumes the function prologue saves it, confirm

// func block(dig *digest, p []byte)
//
// Folds the 64-byte blocks held in p into the four-word MD5 state that dig
// points to. Only whole blocks are consumed: a trailing partial block is
// ignored, and the routine returns without touching dig when p holds less
// than one block.
//
// Arguments:
// 0(FP) is *digest
// 4(FP) is p.array (struct Slice)
// 8(FP) is p.len
//12(FP) is p.cap
//
// Stack frame (84 bytes = 4 + 4 + 4*16 + 3*4)
#define p_end	end-4(SP)	// pointer to the end of data
#define p_data	data-8(SP)	// current data pointer
#define buf	buffer-(8+4*16)(SP)	// 16-word temporary buffer for unaligned input
		// 3 words at 4..12(R13) for called routine parameters

TEXT	·block(SB), NOSPLIT, $84-16
	MOVW	p+4(FP), Rdata	// pointer to the data
	MOVW	p_len+8(FP), Rt0	// number of bytes
	BIC	$63, Rt0		// keep whole 64-byte blocks only
	ADD	Rdata, Rt0
	MOVW	Rt0, p_end	// pointer to end of data

	// The loop below tests its condition at the bottom, so an input with no
	// complete block must be rejected here or one block would be hashed from
	// memory that does not belong to p.
	CMP	Rt0, Rdata
	BHS	done			// Rdata >= end: nothing to hash

loop:
	MOVW	Rdata, p_data	// Save Rdata
	AND.S	$3, Rdata, Rt0	// TST $3, Rdata not working see issue 5921
	BEQ	aligned			// aligned detected - skip copy

	// Copy the unaligned source data into the aligned temporary buffer
	// memmove(to=4(R13), from=8(R13), n=12(R13)) - Corrupts all registers
	MOVW	$buf, Rtable	// to
	MOVW	$64, Rc0		// n
	MOVM.IB	[Rtable,Rdata,Rc0], (R13)
	BL	runtime·memmove(SB)

	// Point to the local aligned copy of the data
	MOVW	$buf, Rdata

aligned:
	// Point to the table of constants
	// A PC relative add would be cheaper than this
	MOVW	$·table(SB), Rtable

	// Load up initial MD5 accumulator
	MOVW	dig+0(FP), Rc0
	MOVM.IA (Rc0), [Ra,Rb,Rc,Rd]

// Each group of four rounds below first loads the next four constants from
// the table; the .W suffix writes the advanced address back to Rtable.

// a += (((c^d)&b)^d) + X[index] + const
// a = (a<<shift | a>>(32-shift)) + b
#define ROUND1(Ra, Rb, Rc, Rd, index, shift, Rconst) \
	EOR	Rc, Rd, Rt0		; \
	AND	Rb, Rt0			; \
	EOR	Rd, Rt0			; \
	MOVW	(index<<2)(Rdata), Rt1	; \
	ADD	Rt1, Rt0			; \
	ADD	Rconst, Rt0			; \
	ADD	Rt0, Ra			; \
	ADD	Ra@>(32-shift), Rb, Ra	;

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND1(Ra, Rb, Rc, Rd,  0,	7, Rc0)
	ROUND1(Rd, Ra, Rb, Rc,  1, 12, Rc1)
	ROUND1(Rc, Rd, Ra, Rb,  2, 17, Rc2)
	ROUND1(Rb, Rc, Rd, Ra,  3, 22, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND1(Ra, Rb, Rc, Rd,  4,	7, Rc0)
	ROUND1(Rd, Ra, Rb, Rc,  5, 12, Rc1)
	ROUND1(Rc, Rd, Ra, Rb,  6, 17, Rc2)
	ROUND1(Rb, Rc, Rd, Ra,  7, 22, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND1(Ra, Rb, Rc, Rd,  8,	7, Rc0)
	ROUND1(Rd, Ra, Rb, Rc,  9, 12, Rc1)
	ROUND1(Rc, Rd, Ra, Rb, 10, 17, Rc2)
	ROUND1(Rb, Rc, Rd, Ra, 11, 22, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND1(Ra, Rb, Rc, Rd, 12,	7, Rc0)
	ROUND1(Rd, Ra, Rb, Rc, 13, 12, Rc1)
	ROUND1(Rc, Rd, Ra, Rb, 14, 17, Rc2)
	ROUND1(Rb, Rc, Rd, Ra, 15, 22, Rc3)

// a += (((b^c)&d)^c) + X[index] + const
// a = (a<<shift | a>>(32-shift)) + b
#define ROUND2(Ra, Rb, Rc, Rd, index, shift, Rconst) \
	EOR	Rb, Rc, Rt0		; \
	AND	Rd, Rt0			; \
	EOR	Rc, Rt0			; \
	MOVW	(index<<2)(Rdata), Rt1	; \
	ADD	Rt1, Rt0			; \
	ADD	Rconst, Rt0			; \
	ADD	Rt0, Ra			; \
	ADD	Ra@>(32-shift), Rb, Ra	;

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND2(Ra, Rb, Rc, Rd,  1,	5, Rc0)
	ROUND2(Rd, Ra, Rb, Rc,  6,	9, Rc1)
	ROUND2(Rc, Rd, Ra, Rb, 11, 14, Rc2)
	ROUND2(Rb, Rc, Rd, Ra,  0, 20, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND2(Ra, Rb, Rc, Rd,  5,	5, Rc0)
	ROUND2(Rd, Ra, Rb, Rc, 10,	9, Rc1)
	ROUND2(Rc, Rd, Ra, Rb, 15, 14, Rc2)
	ROUND2(Rb, Rc, Rd, Ra,  4, 20, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND2(Ra, Rb, Rc, Rd,  9,	5, Rc0)
	ROUND2(Rd, Ra, Rb, Rc, 14,	9, Rc1)
	ROUND2(Rc, Rd, Ra, Rb,  3, 14, Rc2)
	ROUND2(Rb, Rc, Rd, Ra,  8, 20, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND2(Ra, Rb, Rc, Rd, 13,	5, Rc0)
	ROUND2(Rd, Ra, Rb, Rc,  2,	9, Rc1)
	ROUND2(Rc, Rd, Ra, Rb,  7, 14, Rc2)
	ROUND2(Rb, Rc, Rd, Ra, 12, 20, Rc3)

// a += (b^c^d) + X[index] + const
// a = (a<<shift | a>>(32-shift)) + b
#define ROUND3(Ra, Rb, Rc, Rd, index, shift, Rconst) \
	EOR	Rb, Rc, Rt0		; \
	EOR	Rd, Rt0			; \
	MOVW	(index<<2)(Rdata), Rt1	; \
	ADD	Rt1, Rt0			; \
	ADD	Rconst, Rt0			; \
	ADD	Rt0, Ra			; \
	ADD	Ra@>(32-shift), Rb, Ra	;

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND3(Ra, Rb, Rc, Rd,  5,	4, Rc0)
	ROUND3(Rd, Ra, Rb, Rc,  8, 11, Rc1)
	ROUND3(Rc, Rd, Ra, Rb, 11, 16, Rc2)
	ROUND3(Rb, Rc, Rd, Ra, 14, 23, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND3(Ra, Rb, Rc, Rd,  1,	4, Rc0)
	ROUND3(Rd, Ra, Rb, Rc,  4, 11, Rc1)
	ROUND3(Rc, Rd, Ra, Rb,  7, 16, Rc2)
	ROUND3(Rb, Rc, Rd, Ra, 10, 23, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND3(Ra, Rb, Rc, Rd, 13,	4, Rc0)
	ROUND3(Rd, Ra, Rb, Rc,  0, 11, Rc1)
	ROUND3(Rc, Rd, Ra, Rb,  3, 16, Rc2)
	ROUND3(Rb, Rc, Rd, Ra,  6, 23, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND3(Ra, Rb, Rc, Rd,  9,	4, Rc0)
	ROUND3(Rd, Ra, Rb, Rc, 12, 11, Rc1)
	ROUND3(Rc, Rd, Ra, Rb, 15, 16, Rc2)
	ROUND3(Rb, Rc, Rd, Ra,  2, 23, Rc3)

// a += (c^(b|^d)) + X[index] + const
// a = (a<<shift | a>>(32-shift)) + b
#define ROUND4(Ra, Rb, Rc, Rd, index, shift, Rconst) \
	MVN	Rd, Rt0			; \
	ORR	Rb, Rt0			; \
	EOR	Rc, Rt0			; \
	MOVW	(index<<2)(Rdata), Rt1	; \
	ADD	Rt1, Rt0			; \
	ADD	Rconst, Rt0			; \
	ADD	Rt0, Ra			; \
	ADD	Ra@>(32-shift), Rb, Ra	;

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND4(Ra, Rb, Rc, Rd,  0,	6, Rc0)
	ROUND4(Rd, Ra, Rb, Rc,  7, 10, Rc1)
	ROUND4(Rc, Rd, Ra, Rb, 14, 15, Rc2)
	ROUND4(Rb, Rc, Rd, Ra,  5, 21, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND4(Ra, Rb, Rc, Rd, 12,	6, Rc0)
	ROUND4(Rd, Ra, Rb, Rc,  3, 10, Rc1)
	ROUND4(Rc, Rd, Ra, Rb, 10, 15, Rc2)
	ROUND4(Rb, Rc, Rd, Ra,  1, 21, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND4(Ra, Rb, Rc, Rd,  8,	6, Rc0)
	ROUND4(Rd, Ra, Rb, Rc, 15, 10, Rc1)
	ROUND4(Rc, Rd, Ra, Rb,  6, 15, Rc2)
	ROUND4(Rb, Rc, Rd, Ra, 13, 21, Rc3)

	MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3]
	ROUND4(Ra, Rb, Rc, Rd,  4,	6, Rc0)
	ROUND4(Rd, Ra, Rb, Rc, 11, 10, Rc1)
	ROUND4(Rc, Rd, Ra, Rb,  2, 15, Rc2)
	ROUND4(Rb, Rc, Rd, Ra,  9, 21, Rc3)

	// Add the state this block started from to the new accumulator values
	// and store the sums back into the digest.
	MOVW	dig+0(FP), Rt0
	MOVM.IA (Rt0), [Rc0,Rc1,Rc2,Rc3]

	ADD	Rc0, Ra
	ADD	Rc1, Rb
	ADD	Rc2, Rc
	ADD	Rc3, Rd

	MOVM.IA [Ra,Rb,Rc,Rd], (Rt0)

	// Advance the saved source pointer (not Rdata, which may point at buf)
	// by one block and continue while it is still below the end pointer.
	MOVW	p_data, Rdata
	MOVW	p_end, Rt0
	ADD	$64, Rdata
	CMP	Rt0, Rdata
	BLO	loop

done:
	RET

// MD5 constants table
//
// 64 32-bit additive constants, 16 per round, laid out in the order in which
// ·block consumes them (four words at a time, front to back).

	// Round 1
	DATA	·table+0x00(SB)/4, $0xd76aa478
	DATA	·table+0x04(SB)/4, $0xe8c7b756
	DATA	·table+0x08(SB)/4, $0x242070db
	DATA	·table+0x0c(SB)/4, $0xc1bdceee
	DATA	·table+0x10(SB)/4, $0xf57c0faf
	DATA	·table+0x14(SB)/4, $0x4787c62a
	DATA	·table+0x18(SB)/4, $0xa8304613
	DATA	·table+0x1c(SB)/4, $0xfd469501
	DATA	·table+0x20(SB)/4, $0x698098d8
	DATA	·table+0x24(SB)/4, $0x8b44f7af
	DATA	·table+0x28(SB)/4, $0xffff5bb1
	DATA	·table+0x2c(SB)/4, $0x895cd7be
	DATA	·table+0x30(SB)/4, $0x6b901122
	DATA	·table+0x34(SB)/4, $0xfd987193
	DATA	·table+0x38(SB)/4, $0xa679438e
	DATA	·table+0x3c(SB)/4, $0x49b40821
	// Round 2
	DATA	·table+0x40(SB)/4, $0xf61e2562
	DATA	·table+0x44(SB)/4, $0xc040b340
	DATA	·table+0x48(SB)/4, $0x265e5a51
	DATA	·table+0x4c(SB)/4, $0xe9b6c7aa
	DATA	·table+0x50(SB)/4, $0xd62f105d
	DATA	·table+0x54(SB)/4, $0x02441453
	DATA	·table+0x58(SB)/4, $0xd8a1e681
	DATA	·table+0x5c(SB)/4, $0xe7d3fbc8
	DATA	·table+0x60(SB)/4, $0x21e1cde6
	DATA	·table+0x64(SB)/4, $0xc33707d6
	DATA	·table+0x68(SB)/4, $0xf4d50d87
	DATA	·table+0x6c(SB)/4, $0x455a14ed
	DATA	·table+0x70(SB)/4, $0xa9e3e905
	DATA	·table+0x74(SB)/4, $0xfcefa3f8
	DATA	·table+0x78(SB)/4, $0x676f02d9
	DATA	·table+0x7c(SB)/4, $0x8d2a4c8a
	// Round 3
	DATA	·table+0x80(SB)/4, $0xfffa3942
	DATA	·table+0x84(SB)/4, $0x8771f681
	DATA	·table+0x88(SB)/4, $0x6d9d6122
	DATA	·table+0x8c(SB)/4, $0xfde5380c
	DATA	·table+0x90(SB)/4, $0xa4beea44
	DATA	·table+0x94(SB)/4, $0x4bdecfa9
	DATA	·table+0x98(SB)/4, $0xf6bb4b60
	DATA	·table+0x9c(SB)/4, $0xbebfbc70
	DATA	·table+0xa0(SB)/4, $0x289b7ec6
	DATA	·table+0xa4(SB)/4, $0xeaa127fa
	DATA	·table+0xa8(SB)/4, $0xd4ef3085
	DATA	·table+0xac(SB)/4, $0x04881d05
	DATA	·table+0xb0(SB)/4, $0xd9d4d039
	DATA	·table+0xb4(SB)/4, $0xe6db99e5
	DATA	·table+0xb8(SB)/4, $0x1fa27cf8
	DATA	·table+0xbc(SB)/4, $0xc4ac5665
	// Round 4
	DATA	·table+0xc0(SB)/4, $0xf4292244
	DATA	·table+0xc4(SB)/4, $0x432aff97
	DATA	·table+0xc8(SB)/4, $0xab9423a7
	DATA	·table+0xcc(SB)/4, $0xfc93a039
	DATA	·table+0xd0(SB)/4, $0x655b59c3
	DATA	·table+0xd4(SB)/4, $0x8f0ccc92
	DATA	·table+0xd8(SB)/4, $0xffeff47d
	DATA	·table+0xdc(SB)/4, $0x85845dd1
	DATA	·table+0xe0(SB)/4, $0x6fa87e4f
	DATA	·table+0xe4(SB)/4, $0xfe2ce6e0
	DATA	·table+0xe8(SB)/4, $0xa3014314
	DATA	·table+0xec(SB)/4, $0x4e0811a1
	DATA	·table+0xf0(SB)/4, $0xf7537e82
	DATA	·table+0xf4(SB)/4, $0xbd3af235
	DATA	·table+0xf8(SB)/4, $0x2ad7d2bb
	DATA	·table+0xfc(SB)/4, $0xeb86d391
	// Global definition: read-only data, 64 words * 4 bytes = 256 bytes.
	// RODATA comes from textflag.h and replaces the former literal flag 8.
	GLOBL	·table(SB),RODATA,$256