summary refs log tree commit diff stats
path: root/security/nss/lib/freebl/arcfour-amd64-masm.asm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- security/nss/lib/freebl/arcfour-amd64-masm.asm 107
1 files changed, 107 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/arcfour-amd64-masm.asm b/security/nss/lib/freebl/arcfour-amd64-masm.asm
new file mode 100644
index 0000000000..1601c4f899
--- /dev/null
+++ b/security/nss/lib/freebl/arcfour-amd64-masm.asm
@@ -0,0 +1,107 @@
+; This Source Code Form is subject to the terms of the Mozilla Public
+; License, v. 2.0. If a copy of the MPL was not distributed with this
+; file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+; ** ARCFOUR implementation optimized for AMD64.
+; **
+; ** The throughput achieved by this code is about 320 MBytes/sec, on
+; ** a 1.8 GHz AMD Opteron (rev C0) processor.
+
+.CODE
+
+; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
+; const unsigned char *input, unsigned char *output);
+
+
+ARCFOUR PROC FRAME
+; Win64 ABI arguments: rcx = cx, rdx = inputLen, r8 = input, r9 = output.
+; rbp, rbx, rsi and rdi are nonvolatile in that ABI and are saved below, so
+; the procedure is declared FRAME and each push is described with .pushreg,
+; giving the unwinder the data it needs to walk past this function.
+; Loop registers: rbp = d, rcx = x, rdx = y, rax = tx, rsi = in, rdi = out,
+; r9 = end marker, rbx = scratch, r8 = keystream bytes, r11 = byte counter.
+
+        push    rbp
+        .pushreg rbp
+        push    rbx
+        .pushreg rbx
+        push    rsi
+        .pushreg rsi
+        push    rdi
+        .pushreg rdi
+        .endprolog
+
+        mov     rbp, rcx                ; key = ARG(key)
+        mov     rbx, rdx                ; rbx = ARG(len)
+        mov     rsi, r8                 ; in = ARG(in)
+        mov     rdi, r9                 ; out = ARG(out)
+        mov     rcx, [rbp]              ; x = key->x
+        mov     rdx, [rbp+8]            ; y = key->y
+        add     rbp, 16                 ; d = key->data
+        inc     rcx                     ; x++
+        and     rcx, 0ffh               ; x &= 0xff
+        lea     rbx, [rbx+rsi-8]        ; rbx = in+len-8
+        mov     r9, rbx                 ; tmp = in+len-8
+        mov     rax, [rbp+rcx*8]        ; tx = d[x]
+        cmp     rbx, rsi                ; cmp in+len-8 with in (signed)
+        jl      Lend                    ; fewer than 8 bytes: skip block loop
+
+Lstart:
+        add     rsi, 8                  ; increment in
+        add     rdi, 8                  ; increment out
+        ; generate the next 8 bytes of the rc4 stream into r8
+        mov     r11, 8                  ; byte counter
+        ; each pass stores one byte in r8b, then rotates r8 right by 8 bits
+@@:
+        add     dl, al                  ; y += tx
+        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
+        mov     [rbp+rcx*8], ebx        ; d[x] = ty
+        add     bl, al                  ; val = ty + tx
+        mov     [rbp+rdx*8], eax        ; d[y] = tx
+        inc     cl                      ; x++ (NEXT ROUND)
+        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
+        mov     r8b, [rbp+rbx*8]        ; val = d[val]
+        dec     r11b                    ; sets ZF when the 8th byte is done
+        ror     r8, 8                   ; (ror does not change ZF)
+        jnz     @b                      ; branches on ZF from the dec above
+        ; xor 8 bytes
+        xor     r8, [rsi-8]             ; rsi was already advanced past block
+        cmp     rsi, r9                 ; cmp in with in+len-8
+        mov     [rdi-8], r8             ; mov leaves the cmp flags intact
+        jle     Lstart                  ; another full 8-byte block remains
+
+Lend:
+        add     r9, 8                   ; tmp = in+len
+        ; handle the last bytes, one by one
+@@:
+        cmp     r9, rsi                 ; cmp in+len with in
+        jle     Lfinished               ; jump if (in+len <= in)
+        add     dl, al                  ; y += tx
+        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
+        mov     [rbp+rcx*8], ebx        ; d[x] = ty
+        add     bl, al                  ; val = ty + tx
+        mov     [rbp+rdx*8], eax        ; d[y] = tx
+        inc     cl                      ; x++ (NEXT ROUND)
+        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
+        mov     r8b, [rbp+rbx*8]        ; val = d[val]
+        xor     r8b, [rsi]              ; xor 1 byte
+        mov     [rdi], r8b              ; *out = keystream byte ^ *in
+        inc     rsi                     ; in++
+        inc     rdi                     ; out++
+        jmp     @b
+
+Lfinished:
+        ; x was advanced for the next round; undo that before storing state
+        dec     rcx                     ; x--
+        mov     [rbp-8], dl             ; key->y = y (low byte only)
+        mov     [rbp-16], cl            ; key->x = x (low byte only)
+
+        pop     rdi
+        pop     rsi
+        pop     rbx
+        pop     rbp
+        ret
+
+ARCFOUR ENDP
+
+END