1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# ** ARCFOUR implementation optimized for AMD64.
# **
# ** The throughput achieved by this code is about 320 MBytes/sec, on
# ** a 1.8 GHz AMD Opteron (rev C0) processor.
#-----------------------------------------------------------------------
# ARCFOUR(key, len, in, out) -- RC4 stream transform (GAS, AT&T syntax)
#
# Calling convention as used by this code: the four arguments arrive in
# %rdi, %rsi, %rdx, %rcx and %rbx / %rbp are saved and restored, which
# matches the System V AMD64 ABI.
#
#   In:       %rdi = key   pointer to the cipher state (layout below)
#             %rsi = len   number of bytes to process
#             %rdx = in    source buffer
#             %rcx = out   destination buffer
#   Out:      no return value is set up; on exit %rax simply holds the
#             last prefetched table entry.
#   Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags.
#
# Cipher state layout, as accessed here:
#    0(key)   x     read as a quadword, written back as a single byte
#    8(key)   y     read as a quadword, written back as a single byte
#   16(key)   d[]   table with an 8-byte stride, indexed by byte values;
#                   entries are read and written as 32-bit values
#
# NOTE(review): y is never masked and only its low byte is updated, and
# the output index is formed by adding into the low byte of a 32-bit
# table entry.  The code therefore relies on the stored y and on every
# table entry already being below 256 -- confirm against the code that
# initialises the state.
#
# Register roles once the prologue has run:
#   %rbp = d                 %rcx = x (already advanced for next round)
#   %rdx = y                 %rax = tx = d[x], prefetched for next round
#   %rbx = ty / output index %r8  = keystream accumulator
#   %rsi = in cursor         %rdi = out cursor
#   %r9  = in+len-8 during the block loop, in+len during the tail loop
#   %r11 = byte counter within one 8-byte block
#-----------------------------------------------------------------------

        # One RC4 round, shared by the block loop and the tail loop.
        # Entry: %al = tx = d[x].  Exit: keystream byte in %r8b, x already
        # incremented and the matching d[x] prefetched into %eax.
        .macro  RC4_ROUND
        addb    %al, %dl                        # y = (y + tx) mod 256
        movl    0(%rbp,%rdx,8), %ebx            # ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)            # d[x] = ty   (first half of swap)
        addb    %al, %bl                        # index = (ty + tx) mod 256
        movl    %eax, 0(%rbp,%rdx,8)            # d[y] = tx   (second half of swap)
        incb    %cl                             # x = (x + 1) mod 256, for the next round
        movl    0(%rbp,%rcx,8), %eax            # tx = d[x], for the next round
        movb    0(%rbp,%rbx,8), %r8b            # keystream byte = d[index]
        .endm

        .text
        .align  16
        .globl  ARCFOUR
        .type   ARCFOUR,@function
ARCFOUR:
        pushq   %rbp
        pushq   %rbx

        # Move the arguments into their working registers.
        movq    %rdi, %rbp                      # rbp = key
        movq    %rsi, %rbx                      # rbx = len
        movq    %rdx, %rsi                      # rsi = in cursor
        movq    %rcx, %rdi                      # rdi = out cursor

        # Load the saved indices and point rbp at the table.
        movq    (%rbp), %rcx                    # x = key->x
        movq    8(%rbp), %rdx                   # y = key->y
        addq    $16, %rbp                       # rbp = d = key->data
        incq    %rcx                            # x is kept one step ahead ...
        andq    $255, %rcx                      # ... and wrapped to 0..255

        leaq    -8(%rbx,%rsi), %rbx             # rbx = in + len - 8
        movq    %rbx, %r9                       # r9 = start of the last full 8-byte block
        movq    0(%rbp,%rcx,8), %rax            # prefetch tx = d[x]
        cmpq    %rsi, %rbx                      # in + len - 8  versus  in  (signed)
        jl      .Ltail_setup                    # len < 8: there is no full block

        # ---- main loop: 8 input bytes per iteration ---------------------
.Lblock_loop:
        addq    $8, %rsi                        # cursors are advanced up front, so the
        addq    $8, %rdi                        # block is addressed at -8 below

        movq    $8, %r11                        # 8 keystream bytes to collect in %r8
.Lblock_byte:
        RC4_ROUND
        decb    %r11b                           # ZF is set after the eighth byte
        rorq    $8, %r8                         # rotate the new byte up; ror leaves ZF
                                                # alone, and after 8 rotations byte i
                                                # sits in byte lane i
        jnz     .Lblock_byte                    # branches on the ZF produced by decb

        xorq    -8(%rsi), %r8                   # keystream ^ 8 input bytes
        cmpq    %r9, %rsi                       # in cursor versus in + len - 8; the mov
                                                # below does not disturb these flags
        movq    %r8, -8(%rdi)                   # store 8 output bytes
        jle     .Lblock_loop                    # another full block still fits

        # ---- tail: remaining bytes, one at a time -----------------------
.Ltail_setup:
        addq    $8, %r9                         # r9 = in + len (end of input)
.Ltail_loop:
        cmpq    %rsi, %r9                       # end of input versus in cursor
        jle     .Ldone                          # nothing left once end <= cursor
        RC4_ROUND
        xorb    (%rsi), %r8b                    # keystream ^ 1 input byte
        movb    %r8b, (%rdi)                    # store 1 output byte
        incq    %rsi
        incq    %rdi
        jmp     .Ltail_loop

        # ---- write the indices back and return --------------------------
.Ldone:
        decq    %rcx                            # undo the one-step look-ahead on x
        movb    %dl, -8(%rbp)                   # key->y = y   (rbp is key + 16 here)
        movb    %cl, -16(%rbp)                  # key->x = x
        popq    %rbx
        popq    %rbp
        ret
.L_ARCFOUR_end:
        .size   ARCFOUR,.L_ARCFOUR_end-ARCFOUR
# Magic indicating no need for an executable stack
.section .note.GNU-stack,"",@progbits
.previous
|