diff options
Diffstat (limited to 'security/nss/lib/freebl/arcfour-amd64-masm.asm')
-rw-r--r-- | security/nss/lib/freebl/arcfour-amd64-masm.asm | 107 |
1 files changed, 107 insertions, 0 deletions
; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

; ** ARCFOUR implementation optimized for AMD64.
; **
; ** The throughput achieved by this code is about 320 MBytes/sec, on
; ** a 1.8 GHz AMD Opteron (rev C0) processor.

.CODE

;-----------------------------------------------------------------------
; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
;                     const unsigned char *input, unsigned char *output);
;
; Syntax: MASM (ml64).   ABI: Microsoft x64.
;
; In:     rcx = cx, rdx = inputLen, r8 = input, r9 = output
; Out:    nothing in registers; inputLen bytes are written to output
;         and the x / y indices in *cx are updated.
; Saved:  rbp, rbx, rsi, rdi (pushed in the prologue, described by
;         unwind data so a fault inside the routine can be unwound).
; Clobb:  rax, rcx, rdx, r8, r9, r11, flags
;
; Context layout as accessed by this code:
;         [cx+0]   x      (read as 8 bytes, only the low byte is stored)
;         [cx+8]   y      (read as 8 bytes, only the low byte is stored)
;         [cx+16]  d[]    state table, one entry per 8-byte slot
; NOTE(review): entries are loaded as 32-bit values and then used as
; table indices, which assumes every entry's value fits in a byte --
; confirm against the RC4Context definition.
;
; Register roles inside the body:
;         rbp = d (table base)        rsi = input cursor
;         rcx = x (low byte)          rdi = output cursor
;         rdx = y (low byte)          r9  = in+len-8, later in+len
;         rax = tx = d[x]             r8  = 8 keystream bytes / 1 byte
;         rbx = ty, then val index    r11 = byte counter
;
; Invariant at the top of every round: x has already been advanced and
; rax already holds d[x]. The final "dec rcx" undoes that look-ahead
; before x is stored back.
;-----------------------------------------------------------------------

ARCFOUR PROC FRAME

        push    rbp
        .pushreg rbp
        push    rbx
        .pushreg rbx
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        .endprolog

        mov     rbp, rcx                ; key = ARG(key)
        mov     rbx, rdx                ; rbx = ARG(len)
        mov     rsi, r8                 ; in  = ARG(in)
        mov     rdi, r9                 ; out = ARG(out)
        mov     rcx, [rbp]              ; x = key->x
        mov     rdx, [rbp+8]            ; y = key->y
        add     rbp, 16                 ; d = key->data
        inc     rcx                     ; x++
        and     rcx, 0ffh               ; x &= 0xff
        lea     rbx, [rbx+rsi-8]        ; rbx = in+len-8
        mov     r9, rbx                 ; tmp = in+len-8
        mov     rax, [rbp+rcx*8]        ; tx = d[x]
        cmp     rbx, rsi                ; cmp in with in+len-8
        jl      Lend                    ; fewer than 8 bytes: in+len-8 < in
                                        ; (signed, so a wrapped value is caught)

Lstart:
        add     rsi, 8                  ; increment in
        add     rdi, 8                  ; increment out

        ;
        ; generate the next 8 bytes of the rc4 stream into r8
        ;

        mov     r11, 8                  ; byte counter

@@:
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        dec     r11b                    ; sets ZF when all 8 bytes are done
        ror     r8, 8                   ; make room for the next byte
                                        ; (ror does not change ZF)
        jnz     @b

        ;
        ; xor 8 bytes
        ;

        xor     r8, [rsi-8]
        cmp     rsi, r9                 ; cmp in+len-8 with in
        mov     [rdi-8], r8             ; mov leaves the flags from cmp intact
        jle     Lstart                  ; at least 8 more bytes remain

Lend:
        add     r9, 8                   ; tmp = in+len

        ;
        ; handle the last bytes, one by one
        ;

@@:
        cmp     r9, rsi                 ; cmp in with in+len
        jle     Lfinished               ; jump if (in+len <= in)
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        xor     r8b, [rsi]              ; xor 1 byte
        mov     [rdi], r8b
        inc     rsi                     ; in++
        inc     rdi                     ; out++
        jmp     @b

Lfinished:
        dec     rcx                     ; x-- (undo the look-ahead increment)
        mov     [rbp-8], dl             ; key->y = y
        mov     [rbp-16], cl            ; key->x = x

        pop     rdi
        pop     rsi
        pop     rbx
        pop     rbp
        ret

ARCFOUR ENDP

END