63 files changed, 23342 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm
new file mode 100644
index 000000000..7c57688ff
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm
@@ -0,0 +1,532 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_128_sse(void    *in,
+;                          UINT128 *IV,
+;                          UINT128  keys[11],
+;                          void    *out,
+;                          UINT64   len_bytes);
+;
+; arg 1: IN:   pointer to input (cipher text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (plain text)
+; arg 5: LEN:  length in bytes (multiple of 16)
+;
+%include "include/os.asm"
+
+%ifndef AES_CBC_DEC_128
+%define AES_CBC_DEC_128 aes_cbc_dec_128_sse
+%endif
+
+%define MOVDQ	movdqu
+
+%ifdef LINUX
+%define IN		rdi	; arg 1: cipher text
+%define IV		rsi	; arg 2: pointer to IV
+%define KEYS		rdx	; arg 3: round keys, read with movdqa (aligned loads)
+%define OUT		rcx	; arg 4: plain text
+%define LEN		r8	; arg 5: length in bytes
+%else
+%define IN		rcx
+%define IV		rdx
+%define KEYS		r8
+%define OUT		r9
+%define LEN		r10	; filled from [rsp + 8*5] at function entry
+%endif
+; IDX is the running byte offset into IN/OUT; TMP reuses the same register
+%define IDX		rax
+%define TMP		IDX
+%define XDATA0		xmm0	; XDATA0-3: the (up to) four blocks in flight
+%define XDATA1		xmm1
+%define XDATA2		xmm2
+%define XDATA3		xmm3
+%define XKEY0		xmm4	; even round keys 0,2,..,10 are loaded once by the
+%define XKEY2		xmm5	; initial_N paths and reused by main_loop
+%define XKEY4		xmm6
+%define XKEY6		xmm7
+%define XKEY8		xmm8
+%define XKEY10		xmm9
+%define XIV		xmm10	; last cipher text block of the previous group
+%define XSAVED0		xmm11	; XSAVED0-3: copies of the cipher text blocks
+%define XSAVED1		xmm12
+%define XSAVED2		xmm13
+%define XSAVED3		xmm14
+%define XKEY		xmm15	; scratch for the odd round keys
+; IV_TMP shares xmm14 with XSAVED3; it holds [IV] in the initial_N paths only
+%define IV_TMP		XSAVED3
+
+section .text
+
+MKGLOBAL(AES_CBC_DEC_128,function,internal)
+AES_CBC_DEC_128:
+%ifndef LINUX
+	mov	LEN, [rsp + 8*5]	; non-Linux build: arg 5 (LEN) is read from the stack
+%endif
+	test	LEN, LEN		; empty buffer: return without touching IN/OUT
+	jz	done			; (LEN == 0 would otherwise take the 4-block path)
+	mov	TMP, LEN
+	and	TMP, 3*16		; TMP = 16 * (number of blocks mod 4)
+	jz	initial_4		; block count is a multiple of 4
+	cmp	TMP, 2*16
+	jb	initial_1		; one leading block
+	ja	initial_3		; three leading blocks; equal falls through to initial_2
+initial_2:				; reached when (LEN & 3*16) == 2*16
+	; load cipher text (first two blocks)
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; even round keys stay in registers for main_loop
+
+	; save cipher text: block 0 chains into block 1, block 1 into the next group
+	movdqa	XSAVED0, XDATA0
+	movdqa	XIV,     XDATA1
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, [KEYS + 1*16]	; 1. DEC
+	aesdec	XDATA1, [KEYS + 1*16]
+
+	mov	IDX, 2*16		; bytes consumed so far (overwrites TMP)
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, [KEYS + 3*16]	; 3. DEC
+	aesdec	XDATA1, [KEYS + 3*16]
+
+	movdqu	IV_TMP, [IV]		; caller IV, needed only for block 0
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, [KEYS + 5*16]	; 5. DEC
+	aesdec	XDATA1, [KEYS + 5*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+
+	movdqa	XKEY8, [KEYS + 8*16]
+
+	aesdec	XDATA0, [KEYS + 7*16]	; 7. DEC
+	aesdec	XDATA1, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY8		; 8. DEC
+	aesdec	XDATA1, XKEY8
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, [KEYS + 9*16]	; 9. DEC
+	aesdec	XDATA1, [KEYS + 9*16]
+
+	aesdeclast	XDATA0, XKEY10		; 10. DEC
+	aesdeclast	XDATA1, XKEY10
+
+	pxor	XDATA0, IV_TMP		; CBC: XOR with the previous cipher text (IV for block 0)
+	pxor	XDATA1, XSAVED0
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+
+	cmp	LEN, 2*16		; exactly two blocks: finished
+	je	done
+	jmp	main_loop		; remaining length is a multiple of 4 blocks
+
+
+	align 16
+initial_1:				; reached when (LEN & 3*16) == 1*16
+	; load cipher text (first block)
+	movdqu	XDATA0, [IN + 0*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; even round keys stay in registers for main_loop
+
+	; save cipher text: it is the chaining value for the next group
+	movdqa	XIV,     XDATA0
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, [KEYS + 1*16]	; 1. DEC
+
+	mov	IDX, 1*16		; bytes consumed so far (overwrites TMP)
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, [KEYS + 3*16]	; 3. DEC
+
+	movdqu	IV_TMP, [IV]		; caller IV, XORed into block 0 below
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, [KEYS + 5*16]	; 5. DEC
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+
+	movdqa	XKEY8, [KEYS + 8*16]
+
+	aesdec	XDATA0, [KEYS + 7*16]	; 7. DEC
+
+	aesdec	XDATA0, XKEY8		; 8. DEC
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, [KEYS + 9*16]	; 9. DEC
+
+	aesdeclast	XDATA0, XKEY10		; 10. DEC
+
+	pxor	XDATA0, IV_TMP		; CBC: plain text = decrypted block XOR IV
+
+	movdqu	[OUT + 0*16], XDATA0
+
+	cmp	LEN, 1*16		; exactly one block: finished
+	je	done
+	jmp	main_loop		; remaining length is a multiple of 4 blocks
+
+
+initial_3:				; reached when (LEN & 3*16) == 3*16
+	; load cipher text (first three blocks)
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; even round keys stay in registers for main_loop
+
+	; save cipher text: each block chains into the one after it
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XIV,     XDATA2
+
+	movdqa	XKEY, [KEYS + 1*16]	; odd round keys pass through the XKEY scratch register
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, XKEY		; 1. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+
+	movdqa	XKEY, [KEYS + 3*16]
+
+	mov	IDX, 3*16		; bytes consumed so far (overwrites TMP)
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, XKEY		; 3. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+
+	movdqa	XKEY, [KEYS + 5*16]
+	movdqu	IV_TMP, [IV]		; caller IV, needed only for block 0
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, XKEY		; 5. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+
+	movdqa	XKEY, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+
+	movdqa	XKEY8, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY		; 7. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+
+	movdqa	XKEY, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY8		; 8. DEC
+	aesdec	XDATA1, XKEY8
+	aesdec	XDATA2, XKEY8
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, XKEY		; 9. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+
+	aesdeclast	XDATA0, XKEY10	; 10. DEC
+	aesdeclast	XDATA1, XKEY10
+	aesdeclast	XDATA2, XKEY10
+
+	pxor	XDATA0, IV_TMP		; CBC: XOR with the previous cipher text (IV for block 0)
+	pxor	XDATA1, XSAVED0
+	pxor	XDATA2, XSAVED1
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+
+	cmp	LEN, 3*16		; exactly three blocks: finished
+	je	done
+	jmp	main_loop		; remaining length is a multiple of 4 blocks
+
+
+	align 16
+initial_4:				; reached when (LEN & 3*16) == 0
+	; load cipher text (first four blocks)
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+	movdqu	XDATA3, [IN + 3*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; even round keys stay in registers for main_loop
+
+	; save cipher text: each block chains into the one after it
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XSAVED2, XDATA2
+	movdqa	XIV,     XDATA3
+
+	movdqa	XKEY, [KEYS + 1*16]	; odd round keys pass through the XKEY scratch register
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, XKEY		; 1. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 3*16]
+
+	mov	IDX, 4*16		; bytes consumed so far (overwrites TMP)
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+	aesdec	XDATA3, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, XKEY		; 3. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 5*16]
+
+	movdqu	IV_TMP, [IV]		; caller IV, needed only for block 0
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+	aesdec	XDATA3, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, XKEY		; 5. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+	aesdec	XDATA3, XKEY6
+
+	movdqa	XKEY8, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY		; 7. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY8		; 8. DEC
+	aesdec	XDATA1, XKEY8
+	aesdec	XDATA2, XKEY8
+	aesdec	XDATA3, XKEY8
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, XKEY		; 9. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	aesdeclast	XDATA0, XKEY10	; 10. DEC
+	aesdeclast	XDATA1, XKEY10
+	aesdeclast	XDATA2, XKEY10
+	aesdeclast	XDATA3, XKEY10
+
+	pxor	XDATA0, IV_TMP		; CBC: XOR with the previous cipher text (IV for block 0)
+	pxor	XDATA1, XSAVED0
+	pxor	XDATA2, XSAVED1
+	pxor	XDATA3, XSAVED2
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+	movdqu	[OUT + 3*16], XDATA3
+
+	cmp	LEN, 4*16		; exactly four blocks: finished
+	jz	done
+	jmp	main_loop		; remaining length is a multiple of 4 blocks
+
+	align 16
+main_loop:				; four blocks per iteration, starting at byte offset IDX
+	; load cipher text
+	movdqu	XDATA0, [IN + IDX + 0*16]
+	movdqu	XDATA1, [IN + IDX + 1*16]
+	movdqu	XDATA2, [IN + IDX + 2*16]
+	movdqu	XDATA3, [IN + IDX + 3*16]
+
+	; save cipher text (XSAVED3 becomes the chaining value of the next iteration)
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XSAVED2, XDATA2
+	movdqa	XSAVED3, XDATA3
+
+	movdqa	XKEY, [KEYS + 1*16]	; odd round keys are reloaded each iteration
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	add	IDX, 4*16		; advanced early; the stores below subtract 4*16 again
+
+	aesdec	XDATA0, XKEY		; 1. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 3*16]
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+	aesdec	XDATA3, XKEY2
+
+	aesdec	XDATA0, XKEY		; 3. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 5*16]
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+	aesdec	XDATA3, XKEY4
+
+	aesdec	XDATA0, XKEY		; 5. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+	aesdec	XDATA3, XKEY6
+
+	aesdec	XDATA0, XKEY		; 7. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	movdqa	XKEY, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY8		; 8. DEC
+	aesdec	XDATA1, XKEY8
+	aesdec	XDATA2, XKEY8
+	aesdec	XDATA3, XKEY8
+
+	aesdec	XDATA0, XKEY		; 9. DEC
+	aesdec	XDATA1, XKEY
+	aesdec	XDATA2, XKEY
+	aesdec	XDATA3, XKEY
+
+	aesdeclast	XDATA0, XKEY10		; 10. DEC
+	aesdeclast	XDATA1, XKEY10
+	aesdeclast	XDATA2, XKEY10
+	aesdeclast	XDATA3, XKEY10
+
+	pxor	XDATA0, XIV		; CBC: XOR with the last cipher text block of the previous group
+	pxor	XDATA1, XSAVED0
+	pxor	XDATA2, XSAVED1
+	pxor	XDATA3, XSAVED2
+
+	movdqu	[OUT + IDX + 0*16 - 4*16], XDATA0
+	movdqu	[OUT + IDX + 1*16 - 4*16], XDATA1
+	movdqu	[OUT + IDX + 2*16 - 4*16], XDATA2
+	movdqu	[OUT + IDX + 3*16 - 4*16], XDATA3
+
+	movdqa	XIV, XSAVED3		; carry the chaining value into the next iteration
+
+	CMP	IDX, LEN		; equality test: relies on LEN being a multiple of 16
+	jne	main_loop
+
+done:
+; Don't write back IV
+;	movdqu	[IV], XIV
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm
new file mode 100644
index 000000000..72e19f482
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; Routine to compute CBC-MAC based on 128 bit CBC AES encryption code
+
+%define CBC_MAC
+%include "sse/aes_cbc_enc_128_x4.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm
new file mode 100644
index 000000000..11356afae
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm
@@ -0,0 +1,545 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES128 CNTR enc/decrypt "by4"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+%ifndef AES_CNTR_128
+%define AES_CNTR_128 aes_cntr_128_sse
+%define AES_CNTR_BIT_128 aes_cntr_bit_128_sse
+%endif
+
+extern byteswap_const, set_byte15, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+
+%define CONCAT(a,b) a %+ b
+%define MOVDQ movdqu
+; Shared xmm registers: xpart/xdata1, xtmp/xcounter, xtmp2/xbyteswap, xtmp3/xkey0
+%define xdata0	xmm0
+%define xdata1	xmm1
+%define xpart	xmm1
+%define xdata2	xmm2
+%define xdata3	xmm3
+%define xdata4	xmm4
+%define xdata5	xmm5
+%define xdata6	xmm6
+%define xdata7	xmm7
+%define xcounter xmm8
+%define xtmp    xmm8
+%define xbyteswap xmm9
+%define xtmp2   xmm9
+%define xkey0 	xmm10
+%define xtmp3   xmm10
+%define xkey3 	xmm11
+%define xkey6 	xmm12
+%define xkey9	xmm13
+%define xkeyA	xmm14
+%define xkeyB	xmm15
+; GP register roles; the CCM build receives only job and loads the other values from it
+%ifdef CNTR_CCM_SSE
+%ifdef LINUX
+%define job	  rdi
+%define p_in	  rsi
+%define p_keys	  rdx
+%define p_out	  rcx
+%define num_bytes r8
+%define p_ivlen   r9
+%else ;; LINUX
+%define job	  rcx
+%define p_in	  rdx
+%define p_keys	  r8
+%define p_out	  r9
+%define num_bytes r10
+%define p_ivlen   rax
+%endif ;; LINUX
+%define p_IV    r11
+%else ;; CNTR_CCM_SSE
+%ifdef LINUX
+%define p_in	  rdi
+%define p_IV	  rsi
+%define p_keys	  rdx
+%define p_out	  rcx
+%define num_bytes r8
+%define num_bits  r8
+%define p_ivlen   r9
+%else ;; LINUX
+%define p_in	  rcx
+%define p_IV	  rdx
+%define p_keys	  r8
+%define p_out	  r9
+%define num_bytes r10
+%define num_bits  r10
+%define p_ivlen   qword [rsp + 8*6]
+%endif ;; LINUX
+%endif ;; CNTR_CCM_SSE
+; tmp and flags share r11 (r11 is also p_IV in the CCM build)
+%define tmp	r11
+%define flags   r11
+; r12-r14 are pushed and popped by DO_CNTR in the CNTR_BIT variant only
+%define r_bits   r12
+%define tmp2    r13
+%define mask    r14
+
+%macro do_aes_load 2
+	do_aes %1, %2, 1	; load_keys = 1: xkey0/3/6/9 are loaded from p_keys
+%endmacro
+
+%macro do_aes_noload 2
+	do_aes %1, %2, 0	; load_keys = 0: xkey0/3/6/9 must already hold round keys 0/3/6/9
+%endmacro
+
+; do_aes num_in_par cntr_type load_keys
+; cntr_type CNTR_BIT: 64-bit counter adds and partial-byte preservation (else 32-bit adds)
+; load_keys non-zero: (re)load xkey0/xkey3/xkey6/xkey9 from p_keys
+; This increments p_in, but not p_out
+%macro do_aes 3
+%define %%by %1
+%define %%cntr_type %2
+%define %%load_keys %3
+%if (%%load_keys)
+	movdqa	xkey0, [p_keys + 0*16]
+%endif
+
+	movdqa	xdata0, xcounter
+	pshufb	xdata0, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+	movdqa	CONCAT(xdata,i), xcounter
+%ifidn %%cntr_type, CNTR_BIT
+	paddq	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]	; 64-bit add, same width as the xcounter update below
+%else
+	paddd	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
+%endif
+	pshufb	CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 1*16]
+	pxor	xdata0, xkey0
+%ifidn %%cntr_type, CNTR_BIT
+	paddq	xcounter, [rel CONCAT(ddq_add_,%%by)]
+%else
+	paddd	xcounter, [rel CONCAT(ddq_add_,%%by)]
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+	pxor	CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 1
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey3, [p_keys + 3*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 2
+%assign i (i+1)
+%endrep
+	add	p_in, 16*%%by			; input is read below at negative offsets from p_in
+	movdqa	xkeyB, [p_keys + 4*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey3		; key 3
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 4
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey6, [p_keys + 6*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 5
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey6		; key 6
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 8*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey9, [p_keys + 9*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 8
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey9		; key 9
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+	aesenclast	CONCAT(xdata,i), xkeyB		; key 10
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]	; xkeyA/xkeyB now carry input blocks
+	MOVDQ	xkeyB, [p_in + j*16 - 16*%%by]
+	pxor	CONCAT(xdata,i), xkeyA
+	pxor	CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
+	pxor	CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+        ;; check if this is the end of the message
+        mov     tmp, num_bytes
+        and     tmp, ~(%%by*16)
+        jnz     %%skip_preserve
+        ;; Check if there is a partial byte
+        or      r_bits, r_bits
+        jz      %%skip_preserve
+
+%assign idx (%%by - 1)
+        ;; Load output to get last partial byte (overwrites xcounter via xtmp)
+        movdqu         xtmp, [p_out + idx * 16]
+        ;; Save RCX in temporary GP register
+        mov             tmp, rcx
+        mov             mask, 0xff
+        mov             cl, BYTE(r_bits)
+        shr             mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+        mov             rcx, tmp
+        movq            xtmp2, mask
+        pslldq          xtmp2, 15
+        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+        ;; in the partial byte
+
+        ;; Clear all the bits that do not need to be preserved from the output
+        pand            xtmp, xtmp2
+
+        ;; Clear all bits from the input that are not to be ciphered
+        pandn	        xtmp2, CONCAT(xdata, idx)
+        por             xtmp2, xtmp
+        movdqa		CONCAT(xdata, idx), xtmp2
+
+%%skip_preserve:
+%endif
+
+%assign i 0
+%rep %%by
+	MOVDQ	[p_out  + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+
+;; Macro emitting the body of an AES-128 counter-mode routine, including its ret.
+;; All labels are macro-local so the macro can be expanded more than once per file.
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)
+
+%ifidn %%CNTR_TYPE, CCM
+        mov     p_in, [job + _src]
+        add     p_in, [job + _cipher_start_src_offset_in_bytes]
+        mov     p_ivlen, [job + _iv_len_in_bytes]
+        mov	num_bytes, [job + _msg_len_to_cipher_in_bytes]
+        mov     p_keys, [job + _aes_enc_key_expanded]
+        mov     p_out, [job + _dst]
+
+	movdqa	xbyteswap, [rel byteswap_const]
+        ;; Prepare IV ;;
+
+        ;; Byte 0: flags with L'
+        ;; Calculate L' = 15 - Nonce length - 1 = 14 - IV length
+        mov     flags, 14
+        sub     flags, p_ivlen
+        movd    xcounter, DWORD(flags)
+        ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+        ;; Bytes 1 - 7 are always copied (first 7 bytes)
+        mov     p_IV, [job + _iv]       ; reuses r11, flags is no longer needed
+        pinsrb	xcounter, [p_IV], 1
+        pinsrw	xcounter, [p_IV + 1], 1
+        pinsrd  xcounter, [p_IV + 3], 1
+
+        cmp     p_ivlen, 7
+        je      %%finish_nonce_move
+
+        cmp     p_ivlen, 8
+        je      %%iv_length_8
+        cmp     p_ivlen, 9
+        je      %%iv_length_9
+        cmp     p_ivlen, 10
+        je      %%iv_length_10
+        cmp     p_ivlen, 11
+        je      %%iv_length_11
+        cmp     p_ivlen, 12
+        je      %%iv_length_12
+
+        ;; Bytes 8 - 13 (any other length falls through to the 13-byte case)
+%%iv_length_13:
+        pinsrb 	xcounter, [p_IV + 12], 13
+%%iv_length_12:
+        pinsrb 	xcounter, [p_IV + 11], 12
+%%iv_length_11:
+        pinsrd	xcounter, [p_IV + 7], 2
+        jmp     %%finish_nonce_move
+%%iv_length_10:
+        pinsrb	xcounter, [p_IV + 9], 10
+%%iv_length_9:
+        pinsrb	xcounter, [p_IV + 8], 9
+%%iv_length_8:
+        pinsrb	xcounter, [p_IV + 7], 8
+
+%%finish_nonce_move:
+        ; last byte = 1
+        por     xcounter, [rel set_byte15]
+%else ;; CNTR/CNTR_BIT
+%ifndef LINUX
+	mov	num_bytes, [rsp + 8*5] ; arg5
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        push r12
+        push r13
+        push r14
+%endif
+
+	movdqa	xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+        test    p_ivlen, 16
+        jnz     %%iv_is_16_bytes
+        ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+        mov     DWORD(tmp), 0x01000000
+        pinsrq  xcounter, [p_IV], 0
+        pinsrd  xcounter, [p_IV + 8], 2
+        pinsrd  xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT
+        ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+        movdqu  xcounter, [p_IV]
+%endif
+%endif ;; CNTR/CNTR_BIT/CCM
+%%bswap_iv:
+	pshufb	xcounter, xbyteswap
+
+        ;; calculate len
+        ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        mov     r_bits, num_bits
+        add     num_bits, 7
+        shr     num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+        and     r_bits, 7   ; Check if there are remainder bits (0-7)
+%endif
+	mov	tmp, num_bytes
+	and	tmp, 3*16
+	jz	%%chk             ; full-block count is a multiple of 4: no 1-3 block head
+
+	; tmp is 16, 32 or 48, i.e. 1 to 3 leading blocks
+	cmp	tmp, 2*16
+	jg	%%eq3
+	je	%%eq2
+%%eq1:
+	do_aes_load	1, %%CNTR_TYPE	; 1 block
+	add	p_out, 1*16
+        jmp     %%chk
+
+%%eq2:
+	do_aes_load	2, %%CNTR_TYPE	; 2 blocks
+	add	p_out, 2*16
+        jmp      %%chk
+
+%%eq3:
+	do_aes_load	3, %%CNTR_TYPE	; 3 blocks
+	add	p_out, 3*16
+	; fall through to chk
+%%chk:
+        and	num_bytes, ~(3*16)      ; drop the 1-3 blocks handled above
+	jz	%%do_return2
+
+        cmp	num_bytes, 16
+        jb	%%last
+
+	; process multiples of 4 blocks
+	movdqa	xkey0, [p_keys + 0*16]
+	movdqa	xkey3, [p_keys + 3*16]
+	movdqa	xkey6, [p_keys + 6*16]
+	movdqa	xkey9, [p_keys + 9*16]
+
+align 32
+%%main_loop2:
+	; num_bytes is a multiple of 4 blocks + partial bytes
+	do_aes_noload	4, %%CNTR_TYPE
+	add	p_out,	4*16
+	sub	num_bytes, 4*16
+        cmp	num_bytes, 4*16
+	jae	%%main_loop2
+
+        ; Check if there is a partial block
+	or      num_bytes, num_bytes
+        jnz    %%last
+
+%%do_return2:
+%ifidn %%CNTR_TYPE, CCM
+	mov	rax, job                ; the job pointer is the return value
+	or	dword [rax + _status], STS_COMPLETED_AES
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        pop r14
+        pop r13
+        pop r12
+%endif
+
+	ret
+
+%%last:
+
+	; load partial block into XMM register
+	simd_load_sse_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+	; Encryption of a single partial block
+        pshufb	xcounter, xbyteswap
+        movdqa	xdata0, xcounter
+        pxor    xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 9
+        aesenc  xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+	; created keystream
+        aesenclast xdata0, [p_keys + 16*i]
+
+	; xor keystream with the message (scratch)
+        pxor    xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        ;; Check if there is a partial byte
+        or      r_bits, r_bits
+        jz      %%store_output
+
+        ;; Load output to get last partial byte
+        simd_load_sse_15_1 xtmp, p_out, num_bytes
+
+        ;; Save RCX in temporary GP register
+        mov     tmp, rcx
+        mov     mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+        mov     cl, BYTE(r_bits)
+        shr     mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+        mov     rcx, tmp
+
+        movq    xtmp2, mask
+
+        ;; Get number of full bytes in last block of 16 bytes
+        mov     tmp, num_bytes
+        dec     tmp
+        XPSLLB  xtmp2, tmp, xtmp3, tmp2
+        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+        ;; in the partial byte
+
+        ;; Clear all the bits that do not need to be preserved from the output
+        pand    xtmp, xtmp2
+
+        ;; Clear the bits from the input that are not to be ciphered
+        pandn   xtmp2, xdata0
+        por     xtmp2, xtmp
+        movdqa  xdata0, xtmp2
+%endif
+
+%%store_output:
+        ; copy result into the output buffer
+        simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax
+
+        jmp	%%do_return2
+
+%%iv_is_16_bytes:
+        ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+        movdqu  xcounter, [p_IV]
+        jmp     %%bswap_iv
+%endmacro
+
+align 32
+%ifdef CNTR_CCM_SSE
+; JOB_AES_HMAC * aes_cntr_ccm_128_sse(JOB_AES_HMAC *job)
+; arg 1 : job (all cipher parameters are read from it; it is also returned in rax)
+MKGLOBAL(AES_CNTR_CCM_128,function,internal)
+AES_CNTR_CCM_128:
+        DO_CNTR CCM		; the macro expansion ends the routine with its own ret
+%else
+;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
+MKGLOBAL(AES_CNTR_128,function,internal)
+AES_CNTR_128:
+        DO_CNTR CNTR		; bit 4 of iv_len set selects the 16-byte IV path, otherwise 12-byte
+
+;; aes_cntr_bit_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bits, UINT64 iv_len)
+MKGLOBAL(AES_CNTR_BIT_128,function,internal)
+AES_CNTR_BIT_128:
+        DO_CNTR CNTR_BIT	; length is in bits; iv_len is not read, a 16-byte IV is always loaded
+%endif ;; CNTR_CCM_SSE
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm
new file mode 100644
index 000000000..8c54715ee
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm
@@ -0,0 +1,32 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define CNTR_CCM_SSE
+%ifndef AES_CNTR_CCM_128
+%define AES_CNTR_CCM_128 aes_cntr_ccm_128_sse
+%endif
+%include "sse/aes128_cntr_by4_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm
new file mode 100644
index 000000000..144de4f70
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm
@@ -0,0 +1,590 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_192_sse(void    *in,
+;                          UINT128 *IV,
+;                          UINT128  keys[13], // +1 over key length
+;                          void    *out,
+;                          UINT64   len_bytes);
+;
+; arg 1: IN:   pointer to input (cipher text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (plain text)
+; arg 5: LEN:  length in bytes (multiple of 16)
+;
+%include "include/os.asm"
+
+; Default exported symbol name; a file that defines AES_CBC_DEC_192 before this point overrides it.
+%ifndef AES_CBC_DEC_192
+%define AES_CBC_DEC_192 aes_cbc_dec_192_sse
+%endif
+; MOVDQ is defined but never referenced in this file; the code below spells out movdqu/movdqa directly.
+%define MOVDQ	movdqu
+; Argument registers: LINUX build uses rdi/rsi/rdx/rcx/r8; otherwise rcx/rdx/r8/r9, with LEN loaded from the stack into r10.
+%ifdef LINUX
+%define IN		rdi
+%define IV		rsi
+%define KEYS		rdx
+%define OUT		rcx
+%define LEN		r8
+%else
+%define IN		rcx
+%define IV		rdx
+%define KEYS		r8
+%define OUT		r9
+%define LEN		r10
+%endif
+; Register roles: IDX = byte offset into IN/OUT (TMP aliases it); XKEY0/2/4/6/10 stay loaded across main_loop; XKEY_A/XKEY_B are reloaded per pass.
+%define IDX		rax
+%define TMP		IDX
+%define XDATA0		xmm0
+%define XDATA1		xmm1
+%define XDATA2		xmm2
+%define XDATA3		xmm3
+%define XKEY0		xmm4
+%define XKEY2		xmm5
+%define XKEY4		xmm6
+%define XKEY6		xmm7
+%define XKEY10		xmm8
+%define XIV		xmm9
+%define XSAVED0		xmm10
+%define XSAVED1		xmm11
+%define XSAVED2		xmm12
+%define XSAVED3		xmm13
+%define XKEY_A		xmm14
+%define XKEY_B		xmm15
+; IV_TMP shares xmm13 with XSAVED3: it holds the caller's IV in the initial_* paths, while main_loop uses the register as XSAVED3.
+%define IV_TMP		XSAVED3
+
+section .text
+; Entry: an initial_* path decrypts the first 1-4 blocks (chosen from LEN mod 64) using the caller's IV, then main_loop handles 4 blocks per pass.
+MKGLOBAL(AES_CBC_DEC_192,function,internal)
+AES_CBC_DEC_192:
+%ifndef LINUX
+	mov	LEN, [rsp + 8*5]	; non-LINUX build: 5th argument comes from the stack
+%endif
+; TMP = LEN & 0x30 = 16 * (block count mod 4); LEN itself is kept for the end-of-data compares further down.
+	mov	TMP, LEN
+	and	TMP, 3*16
+	jz	initial_4		; remainder 0 (also taken for LEN == 0, which is not special-cased)
+	cmp	TMP, 2*16
+	jb	initial_1		; remainder 1
+	ja	initial_3		; remainder 3
+; remainder 2: fall through into initial_2
+initial_2:	; block count mod 4 == 2: decrypt blocks 0-1 and load XKEY0/2/4/6/10, which main_loop reuses without reloading
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; movdqa: KEYS must be 16-byte aligned
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XIV,     XDATA1		; last ciphertext block: chaining value for main_loop
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, [KEYS + 1*16]	; 1. DEC
+	aesdec	XDATA1, [KEYS + 1*16]
+
+	mov	IDX, 2*16		; byte offset at which main_loop resumes
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, [KEYS + 3*16]	; 3. DEC
+	aesdec	XDATA1, [KEYS + 3*16]
+
+	movdqu	IV_TMP, [IV]		; caller's IV (unaligned load)
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, [KEYS + 5*16]	; 5. DEC
+	aesdec	XDATA1, [KEYS + 5*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, [KEYS + 7*16]	; 7. DEC
+	aesdec	XDATA1, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, [KEYS + 9*16]	; 9. DEC
+	aesdec	XDATA1, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+	aesdec	XDATA1, XKEY10
+
+	aesdec	XDATA0, [KEYS + 11*16]	; 11. DEC
+	aesdec	XDATA1, [KEYS + 11*16]
+
+	aesdeclast	XDATA0, [KEYS + 12*16]	; 12. DECLAST (final round)
+	aesdeclast	XDATA1, [KEYS + 12*16]
+
+	pxor	XDATA0, IV_TMP		; CBC: first plaintext = D(C0) ^ IV
+	pxor	XDATA1, XSAVED0		; second plaintext = D(C1) ^ C0
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+
+	cmp	LEN, 2*16
+	je	done			; nothing beyond the initial blocks
+	jmp	main_loop
+
+
+	align 16
+initial_1:	; block count mod 4 == 1: decrypt block 0 and load XKEY0/2/4/6/10, which main_loop reuses without reloading
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; movdqa: KEYS must be 16-byte aligned
+
+	; save cipher text
+	movdqa	XIV,     XDATA0		; chaining value for main_loop
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, [KEYS + 1*16]	; 1. DEC
+
+	mov	IDX, 1*16		; byte offset at which main_loop resumes
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, [KEYS + 3*16]	; 3. DEC
+
+	movdqu	IV_TMP, [IV]		; caller's IV (unaligned load)
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, [KEYS + 5*16]	; 5. DEC
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, [KEYS + 7*16]	; 7. DEC
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, [KEYS + 9*16]	; 9. DEC
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+
+	aesdec	XDATA0, [KEYS + 11*16]	; 11. DEC
+
+	aesdeclast XDATA0, [KEYS + 12*16]	; 12. DECLAST (final round)
+
+	pxor	XDATA0, IV_TMP		; CBC: plaintext = D(C0) ^ IV
+
+	movdqu	[OUT + 0*16], XDATA0
+
+	cmp	LEN, 1*16
+	je	done			; single-block message
+	jmp	main_loop
+
+
+initial_3:	; block count mod 4 == 3: decrypt blocks 0-2 and load XKEY0/2/4/6/10, which main_loop reuses without reloading
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; movdqa: KEYS must be 16-byte aligned
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XIV,     XDATA2		; last ciphertext block: chaining value for main_loop
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, XKEY_A		; 1. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+	mov	IDX, 3*16		; byte offset at which main_loop resumes
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, XKEY_A		; 3. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+	movdqu	IV_TMP, [IV]		; caller's IV (unaligned load)
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, XKEY_A		; 5. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY_A		; 7. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, XKEY_A		; 9. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+	aesdec	XDATA1, XKEY10
+	aesdec	XDATA2, XKEY10
+
+	movdqa	XKEY_B, [KEYS + 12*16]	; last round key of the 13-entry AES-192 schedule
+
+	aesdec	XDATA0, XKEY_A		; 11. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	; No load from [KEYS + 13*16] here: keys[13] ends at index 12, and main_loop reloads XKEY_A before its first use.
+
+	aesdeclast	XDATA0, XKEY_B		; 12. DECLAST (final round)
+	aesdeclast	XDATA1, XKEY_B
+	aesdeclast	XDATA2, XKEY_B
+
+
+
+	pxor	XDATA0, IV_TMP		; CBC: first plaintext = D(C0) ^ IV
+	pxor	XDATA1, XSAVED0		; later plaintexts = D(Cn) ^ C(n-1)
+	pxor	XDATA2, XSAVED1
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+
+	cmp	LEN, 3*16
+	je	done			; nothing beyond the initial blocks
+	jmp	main_loop
+
+
+	align 16
+initial_4:	; block count mod 4 == 0: decrypt blocks 0-3 and load XKEY0/2/4/6/10, which main_loop reuses without reloading
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+	movdqu	XDATA3, [IN + 3*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; movdqa: KEYS must be 16-byte aligned
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XSAVED2, XDATA2
+	movdqa	XIV,     XDATA3		; last ciphertext block: chaining value for main_loop
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, XKEY_A		; 1. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+
+	mov	IDX, 4*16		; byte offset at which main_loop resumes
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+	aesdec	XDATA3, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, XKEY_A		; 3. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+
+	movdqu	IV_TMP, [IV]		; caller's IV (unaligned load)
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+	aesdec	XDATA3, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, XKEY_A		; 5. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+	aesdec	XDATA3, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY_A		; 7. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+	aesdec	XDATA3, XKEY_B
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, XKEY_A		; 9. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	aesdec	XDATA0, XKEY10	; 10. DEC
+	aesdec	XDATA1, XKEY10
+	aesdec	XDATA2, XKEY10
+	aesdec	XDATA3, XKEY10
+
+	movdqa	XKEY_B, [KEYS + 12*16]	; last round key of the 13-entry schedule
+
+	aesdec	XDATA0, XKEY_A		; 11. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+
+
+	aesdeclast	XDATA0, XKEY_B		; 12. DECLAST (final round)
+	aesdeclast	XDATA1, XKEY_B
+	aesdeclast	XDATA2, XKEY_B
+	aesdeclast	XDATA3, XKEY_B
+
+	pxor	XDATA0, IV_TMP		; CBC: first plaintext = D(C0) ^ IV
+	pxor	XDATA1, XSAVED0		; later plaintexts = D(Cn) ^ C(n-1)
+	pxor	XDATA2, XSAVED1
+	pxor	XDATA3, XSAVED2
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+	movdqu	[OUT + 3*16], XDATA3
+
+	cmp	LEN, 4*16
+	jz	done			; exactly four blocks in total
+	jmp	main_loop
+
+	align 16
+main_loop:	; per pass: decrypt the 4 blocks at IN + IDX; expects XKEY0/2/4/6/10 and XIV to be set by an initial_* path
+	; load cipher text
+	movdqu	XDATA0, [IN + IDX + 0*16]
+	movdqu	XDATA1, [IN + IDX + 1*16]
+	movdqu	XDATA2, [IN + IDX + 2*16]
+	movdqu	XDATA3, [IN + IDX + 3*16]
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XSAVED2, XDATA2
+	movdqa	XSAVED3, XDATA3		; becomes XIV for the next pass
+
+	movdqa	XKEY_A, [KEYS + 1*16]	; odd round keys (and 8, 12) are reloaded every pass
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	add	IDX, 4*16		; advanced early; the stores below compensate with - 4*16
+
+	aesdec	XDATA0, XKEY_A		; 1. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+	aesdec	XDATA3, XKEY2
+
+	aesdec	XDATA0, XKEY_A		; 3. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+	aesdec	XDATA3, XKEY4
+
+	aesdec	XDATA0, XKEY_A		; 5. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+	aesdec	XDATA3, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY_A		; 7. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+	aesdec	XDATA3, XKEY_B
+
+	aesdec	XDATA0, XKEY_A		; 9. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+	aesdec	XDATA1, XKEY10
+	aesdec	XDATA2, XKEY10
+	aesdec	XDATA3, XKEY10
+
+	movdqa	XKEY_B, [KEYS + 12*16]	; last round key of the 13-entry schedule
+
+	aesdec	XDATA0, XKEY_A		; 11. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	aesdeclast	XDATA0, XKEY_B		; 12. DECLAST
+	aesdeclast	XDATA1, XKEY_B
+	aesdeclast	XDATA2, XKEY_B
+	aesdeclast	XDATA3, XKEY_B
+
+	pxor	XDATA0, XIV		; CBC: chain from the last ciphertext block of the previous group
+	pxor	XDATA1, XSAVED0
+	pxor	XDATA2, XSAVED1
+	pxor	XDATA3, XSAVED2
+
+	movdqu	[OUT + IDX + 0*16 - 4*16], XDATA0
+	movdqu	[OUT + IDX + 1*16 - 4*16], XDATA1
+	movdqu	[OUT + IDX + 2*16 - 4*16], XDATA2
+	movdqu	[OUT + IDX + 3*16 - 4*16], XDATA3
+
+	movdqa	XIV, XSAVED3
+
+	CMP	IDX, LEN		; exit test is equality, so LEN must be an exact multiple of 16
+	jne	main_loop
+
+done:
+; Common exit. The caller's IV buffer is left unmodified: the write-back below is deliberately disabled.
+;	movdqu	[IV], XIV
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm
new file mode 100644
index 000000000..eaa89f21e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm
@@ -0,0 +1,470 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES192 CNTR enc/decrypt "by4"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+; Default symbol names. Both sit under the AES_CNTR_192 guard, so a file that pre-defines AES_CNTR_192 must define AES_CNTR_BIT_192 as well.
+%ifndef AES_CNTR_192
+%define AES_CNTR_192 aes_cntr_192_sse
+%define AES_CNTR_BIT_192 aes_cntr_bit_192_sse
+%endif
+; Constants referenced below through [rel ...]; they are defined outside this file.
+extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+; CONCAT pastes two tokens (xdata + i, ddq_add_ + i) inside the %rep loops; MOVDQ is the move used for the in/out data blocks.
+%define CONCAT(a,b) a %+ b
+%define MOVDQ movdqu
+; XMM roles. Several names share a register: xpart = xdata1 (xmm1), xtmp = xcounter (xmm8), xtmp2 = xbyteswap (xmm9), xtmp3 = xkey0 (xmm10).
+%define xdata0	xmm0
+%define xdata1	xmm1
+%define xpart	xmm1
+%define xdata2	xmm2
+%define xdata3	xmm3
+%define xdata4	xmm4
+%define xdata5	xmm5
+%define xdata6	xmm6
+%define xdata7	xmm7
+%define xcounter xmm8
+%define xtmp    xmm8
+%define xbyteswap xmm9
+%define xtmp2   xmm9
+%define xkey0 	xmm10
+%define xtmp3   xmm10
+%define xkey4 	xmm11
+%define xkey8 	xmm12
+%define xkey12	xmm13
+%define xkeyA	xmm14
+%define xkeyB	xmm15
+; Argument registers: System V order under LINUX; otherwise rcx/rdx/r8/r9 with arguments 5 and 6 on the stack. num_bits aliases num_bytes.
+%ifdef LINUX
+%define p_in	  rdi
+%define p_IV	  rsi
+%define p_keys	  rdx
+%define p_out	  rcx
+%define num_bytes r8
+%define num_bits  r8
+%define p_ivlen   r9
+%else
+%define p_in	  rcx
+%define p_IV	  rdx
+%define p_keys	  r8
+%define p_out	  r9
+%define num_bytes r10
+%define num_bits  r10
+%define p_ivlen   qword [rsp + 8*6]
+%endif
+; General-purpose scratch register.
+%define tmp	r11
+; r12-r14 are touched only by the CNTR_BIT flavour, which pushes them on entry and pops them before returning.
+%define r_bits   r12
+%define tmp2    r13
+%define mask    r14
+; do_aes_load by, cntr_type: do_aes with load_keys = 1, i.e. xkey0/xkey4/xkey8/xkey12 are loaded from p_keys as part of the expansion
+%macro do_aes_load 2
+	do_aes %1, %2, 1
+%endmacro
+; do_aes_noload by, cntr_type: do_aes with load_keys = 0, i.e. xkey0/xkey4/xkey8/xkey12 must already hold round keys 0/4/8/12
+%macro do_aes_noload 2
+	do_aes %1, %2, 0
+%endmacro
+
+; Encrypts "by" counter blocks (key-0 XOR, 11 aesenc, 1 aesenclast), XORs them with "by" input blocks and stores them at p_out.
+; do_aes num_in_par, cntr_type (CNTR or CNTR_BIT), load_keys (non-zero: also load xkey0/xkey4/xkey8/xkey12)
+; This increments p_in, but not p_out. xcounter is advanced by ddq_add_<by>; xkeyA/xkeyB are clobbered.
+%macro do_aes 3
+%define %%by %1
+%define %%cntr_type %2
+%define %%load_keys %3
+
+%if (%%load_keys)
+	movdqa	xkey0, [p_keys + 0*16]
+%endif
+
+	movdqa	xdata0, xcounter
+	pshufb	xdata0, xbyteswap	; block 0 = byte-shuffled copy of the current counter
+%assign i 1
+%rep (%%by - 1)
+	movdqa	CONCAT(xdata,i), xcounter
+	paddd	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]	; block i = counter + ddq_add_i constant
+	pshufb	CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 1*16]
+
+	pxor	xdata0, xkey0
+%ifidn %%cntr_type, CNTR_BIT
+	paddq	xcounter, [rel CONCAT(ddq_add_,%%by)]	; CNTR_BIT: 64-bit lane add
+%else
+	paddd	xcounter, [rel CONCAT(ddq_add_,%%by)]	; CNTR: 32-bit lane add
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+	pxor	CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 1
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 2
+%assign i (i+1)
+%endrep
+
+	add	p_in, 16*%%by		; advanced early; the input loads below compensate with a negative offset
+
+%if (%%load_keys)
+	movdqa	xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 3
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey4		; key 4
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 5
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 7
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey8		; key 8
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 9
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 11
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+	aesenclast	CONCAT(xdata,i), xkey12	; key 12
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]	; round keys are done with: xkeyA/xkeyB now carry input blocks, two per iteration
+	MOVDQ	xkeyB, [p_in + j*16 - 16*%%by]
+	pxor	CONCAT(xdata,i), xkeyA
+	pxor	CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]	; odd block count: one input block is left over
+	pxor	CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+        ;; check if this is the end of the message (tmp is zero only when num_bytes has no bits set outside by*16)
+        mov     tmp, num_bytes
+        and     tmp, ~(%%by*16)
+        jnz     %%skip_preserve
+        ;; Check if there is a partial byte
+        or      r_bits, r_bits
+        jz      %%skip_preserve
+
+%assign idx (%%by - 1)
+        ;; Load output to get last partial byte (xtmp shares xmm8 with xcounter, so the counter is overwritten here)
+        movdqu         xtmp, [p_out + idx * 16]
+
+        ;; Save RCX in temporary GP register: cl is needed as the shift count and rcx is p_out (LINUX) or p_in (otherwise)
+        mov             tmp, rcx
+        mov             mask, 0xff
+        mov             cl, BYTE(r_bits)
+        shr             mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+        mov             rcx, tmp
+
+        movq            xtmp2, mask
+        pslldq          xtmp2, 15
+        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+        ;; in the partial byte (byte 15 of the last block)
+
+        ;; Clear all the bits that do not need to be preserved from the output
+        pand            xtmp, xtmp2
+
+        ;; Clear all bits from the input that are not to be ciphered
+        pandn	        xtmp2, CONCAT(xdata, idx)
+        por             xtmp2, xtmp
+        movdqa		CONCAT(xdata, idx), xtmp2
+
+%%skip_preserve:
+%endif
+
+%assign i 0
+%rep %%by
+	MOVDQ	[p_out  + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+
+;; Macro performing AES-CTR with a 13-entry round-key schedule at p_keys: 1-3 leading blocks, then groups of 4 blocks,
+;; then a final partial block of 1-15 bytes. The expansion is a complete function body: every path ends in ret.
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+
+%ifndef LINUX
+	mov	num_bytes, [rsp + 8*5]
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        push r12
+        push r13
+        push r14
+%endif
+
+	movdqa	xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+        test    p_ivlen, 16
+        jnz     %%iv_is_16_bytes
+        ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+        mov     DWORD(tmp), 0x01000000
+        pinsrq  xcounter, [p_IV], 0
+        pinsrd  xcounter, [p_IV + 8], 2
+        pinsrd  xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT
+        ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+        movdqu  xcounter, [p_IV]
+%endif
+
+%%bswap_iv:
+	pshufb	xcounter, xbyteswap
+
+        ;; calculate len
+        ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        mov     r_bits, num_bits
+        add     num_bits, 7
+        shr     num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+        and     r_bits, 7   ; Check if there are remainder bits (0-7)
+%endif
+	mov	tmp, num_bytes
+	and	tmp, 3*16
+	jz	%%chk             ; whole-block count is a multiple of 4: no 1-3 block prefix to process
+
+	; 1 <= tmp <= 3
+	cmp	tmp, 2*16
+	jg	%%eq3
+	je	%%eq2
+%%eq1:
+	do_aes_load	1, %%CNTR_TYPE
+	add	p_out, 1*16
+        jmp     %%chk
+
+%%eq2:
+	do_aes_load	2, %%CNTR_TYPE
+	add	p_out, 2*16
+        jmp      %%chk
+
+%%eq3:
+	do_aes_load	3, %%CNTR_TYPE
+	add	p_out, 3*16
+	; fall through to chk
+%%chk:
+        and	num_bytes, ~(3*16)	; drop the 1-3 blocks handled above; what remains is 64*k + (0..15)
+	jz	%%do_return2
+
+        cmp	num_bytes, 16
+        jb	%%last
+
+	; process multiples of 4 blocks
+	movdqa	xkey0, [p_keys + 0*16]
+	movdqa	xkey4, [p_keys + 4*16]
+	movdqa	xkey8, [p_keys + 8*16]
+	movdqa	xkey12, [p_keys + 12*16]
+
+align 32
+%%main_loop2:
+	; num_bytes is a multiple of 4 blocks + partial bytes
+	do_aes_noload	4, %%CNTR_TYPE
+	add	p_out,	4*16
+	sub	num_bytes, 4*16
+        cmp	num_bytes, 4*16
+	jae	%%main_loop2
+
+        ; Check if there is a partial block
+	or      num_bytes, num_bytes
+        jnz    %%last
+
+%%do_return2:
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        pop r14
+        pop r13
+        pop r12
+%endif
+
+	ret
+
+%%last:
+
+	; load partial block into XMM register
+	simd_load_sse_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+	; Encryption of a single partial block
+        pshufb	xcounter, xbyteswap
+        movdqa	xdata0, xcounter
+        pxor    xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 11
+        aesenc  xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+	; created keystream
+        aesenclast xdata0, [p_keys + 16*i]
+
+	; xor keystream with the message (scratch)
+        pxor    xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        ;; Check if there is a partial byte
+        or      r_bits, r_bits
+        jz      %%store_output
+
+        ;; Load output to get last partial byte
+        simd_load_sse_15_1 xtmp, p_out, num_bytes
+
+        ;; Save RCX in temporary GP register
+        mov     tmp, rcx
+        mov     mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+        mov     cl, BYTE(r_bits)
+        shr     mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+        mov     rcx, tmp
+
+        movq    xtmp2, mask
+
+        ;; Get number of full bytes in last block of 16 bytes
+        mov     tmp, num_bytes
+        dec     tmp
+        XPSLLB  xtmp2, tmp, xtmp3, tmp2
+        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+        ;; in the partial byte
+
+        ;; Clear all the bits that do not need to be preserved from the output
+        pand    xtmp, xtmp2
+
+        ;; Clear the bits from the input that are not to be ciphered
+        pandn   xtmp2, xdata0
+        por     xtmp2, xtmp
+        movdqa  xdata0, xtmp2
+%endif
+
+%%store_output:
+        ; copy result into the output buffer
+        simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax
+
+        jmp	%%do_return2
+
+%%iv_is_16_bytes:
+        ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+        movdqu  xcounter, [p_IV]
+        jmp     %%bswap_iv
+%endmacro
+
+align 32
+;; aes_cntr_192_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) - byte-length flavour; DO_CNTR supplies the whole body including ret
+MKGLOBAL(AES_CNTR_192,function,internal)
+AES_CNTR_192:
+        DO_CNTR CNTR
+;; Bit-length flavour: the 5th argument is a bit count, and this expansion saves/restores r12-r14 around its body.
+;; aes_cntr_bit_192_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bits, UINT64 iv_len)
+MKGLOBAL(AES_CNTR_BIT_192,function,internal)
+AES_CNTR_BIT_192:
+        DO_CNTR CNTR_BIT
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm
new file mode 100644
index 000000000..c82a4f58a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm
@@ -0,0 +1,634 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_256_sse(void    *in,
+;                          UINT128 *IV,
+;                          UINT128  keys[15],
+;                          void    *out,
+;                          UINT64   len_bytes);
+;
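+; arg 1: IN   (rdi on Linux, rcx otherwise): pointer to input (cipher text)
+; arg 2: IV   (rsi on Linux, rdx otherwise): pointer to IV
+; arg 3: KEYS (rdx on Linux, r8 otherwise):  pointer to keys
+; arg 4: OUT  (rcx on Linux, r9 otherwise):  pointer to output (plain text)
+; arg 5: LEN  (r8 on Linux, else read from [rsp + 8*5] into r10): length in bytes (multiple of 16)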
+;
+
+%include "include/os.asm"
+; Default exported symbol name; a file that defines AES_CBC_DEC_256 before this point overrides it.
+%ifndef AES_CBC_DEC_256
+%define AES_CBC_DEC_256 aes_cbc_dec_256_sse
+%endif
+; NOTE(review): MOVDQ is not referenced in the part of this file reviewed here, which spells out movdqu/movdqa - confirm for the rest.
+%define MOVDQ	movdqu
+; Argument registers: LINUX build uses rdi/rsi/rdx/rcx/r8; otherwise rcx/rdx/r8/r9, with LEN loaded from the stack into r10.
+%ifdef LINUX
+%define IN		rdi
+%define IV		rsi
+%define KEYS		rdx
+%define OUT		rcx
+%define LEN		r8
+%else
+%define IN		rcx
+%define IV		rdx
+%define KEYS		r8
+%define OUT		r9
+%define LEN		r10
+%endif
+; Register roles: IDX = byte offset into IN/OUT (TMP aliases it); XKEY0/2/4/6/10 are loaded by each initial_* path; XIV holds the chaining block.
+%define IDX		rax
+%define TMP		IDX
+%define XDATA0		xmm0
+%define XDATA1		xmm1
+%define XDATA2		xmm2
+%define XDATA3		xmm3
+%define XKEY0		xmm4
+%define XKEY2		xmm5
+%define XKEY4		xmm6
+%define XKEY6		xmm7
+%define XKEY10		xmm8
+%define XIV		xmm9
+%define XSAVED0		xmm10
+%define XSAVED1		xmm11
+%define XSAVED2		xmm12
+%define XSAVED3		xmm13
+%define XKEY_A		xmm14
+%define XKEY_B		xmm15
+; IV_TMP shares xmm13 with XSAVED3; the initial_* paths load the caller's IV into it.
+%define IV_TMP		XSAVED3
+
+section .text
+; Entry: pick the initial_* path from LEN mod 64; that path decrypts the first 1-4 blocks with the caller's IV, then goes to done or main_loop.
+MKGLOBAL(AES_CBC_DEC_256,function,internal)
+AES_CBC_DEC_256:
+%ifndef LINUX
+	mov	LEN, [rsp + 8*5]	; non-LINUX build: 5th argument comes from the stack
+%endif
+; TMP = LEN & 0x30 = 16 * (block count mod 4); LEN itself is kept for the end-of-data compares further down.
+	mov	TMP, LEN
+	and	TMP, 3*16
+	jz	initial_4		; remainder 0 (also taken for LEN == 0, which is not special-cased)
+	cmp	TMP, 2*16
+	jb	initial_1		; remainder 1
+	ja	initial_3		; remainder 3
+; remainder 2: fall through into initial_2
+initial_2:	; block count mod 4 == 2: decrypt blocks 0-1 (14 rounds) and load XKEY0/2/4/6/10 before handing over to main_loop
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; movdqa: KEYS must be 16-byte aligned
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XIV,     XDATA1		; last ciphertext block kept as the next chaining value
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, [KEYS + 1*16]	; 1. DEC
+	aesdec	XDATA1, [KEYS + 1*16]
+
+	mov	IDX, 2*16		; byte offset of the first block not yet processed
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, [KEYS + 3*16]	; 3. DEC
+	aesdec	XDATA1, [KEYS + 3*16]
+
+	movdqu	IV_TMP, [IV]		; caller's IV (unaligned load)
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, [KEYS + 5*16]	; 5. DEC
+	aesdec	XDATA1, [KEYS + 5*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, [KEYS + 7*16]	; 7. DEC
+	aesdec	XDATA1, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, [KEYS + 9*16]	; 9. DEC
+	aesdec	XDATA1, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+	aesdec	XDATA1, XKEY10
+
+	aesdec	XDATA0, [KEYS + 11*16]	; 11. DEC
+	aesdec	XDATA1, [KEYS + 11*16]
+
+	aesdec	XDATA0, [KEYS + 12*16]	; 12. DEC
+	aesdec	XDATA1, [KEYS + 12*16]
+
+	aesdec	XDATA0, [KEYS + 13*16]	; 13. DEC
+	aesdec	XDATA1, [KEYS + 13*16]
+
+	aesdeclast	XDATA0, [KEYS + 14*16]	; 14. DECLAST (final round)
+	aesdeclast	XDATA1, [KEYS + 14*16]
+
+	pxor	XDATA0, IV_TMP		; CBC: first plaintext = D(C0) ^ IV
+	pxor	XDATA1, XSAVED0		; second plaintext = D(C1) ^ C0
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+
+	cmp	LEN, 2*16
+	je	done			; nothing beyond the initial blocks
+	jmp	main_loop
+
+
+	align 16
+initial_1:	; block count mod 4 == 1: decrypt block 0 (14 rounds) and load XKEY0/2/4/6/10 before handing over to main_loop
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; movdqa: KEYS must be 16-byte aligned
+
+	; save cipher text
+	movdqa	XIV,     XDATA0		; kept as the next chaining value
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, [KEYS + 1*16]	; 1. DEC
+
+	mov	IDX, 1*16		; byte offset of the first block not yet processed
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, [KEYS + 3*16]	; 3. DEC
+
+	movdqu	IV_TMP, [IV]		; caller's IV (unaligned load)
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, [KEYS + 5*16]	; 5. DEC
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, [KEYS + 7*16]	; 7. DEC
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, [KEYS + 9*16]	; 9. DEC
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+
+	aesdec	XDATA0, [KEYS + 11*16]	; 11. DEC
+
+	aesdec	XDATA0, [KEYS + 12*16]	; 12. DEC
+
+	aesdec	XDATA0, [KEYS + 13*16]	; 13. DEC
+
+	aesdeclast	XDATA0, [KEYS + 14*16]	; 14. DECLAST (final round)
+
+	pxor	XDATA0, IV_TMP		; CBC: plaintext = D(C0) ^ IV
+
+	movdqu	[OUT + 0*16], XDATA0
+
+	cmp	LEN, 1*16
+	je	done			; single-block message
+	jmp	main_loop
+
+
+initial_3:	; block count mod 4 == 3: decrypt blocks 0-2 (14 rounds) and load XKEY0/2/4/6/10 before handing over to main_loop
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]	; movdqa: KEYS must be 16-byte aligned
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XIV,     XDATA2		; last ciphertext block kept as the next chaining value
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, XKEY_A		; 1. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+	mov	IDX, 3*16		; byte offset of the first block not yet processed
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, XKEY_A		; 3. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+	movdqu	IV_TMP, [IV]		; caller's IV (unaligned load)
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, XKEY_A		; 5. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY_A		; 7. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, XKEY_A		; 9. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+	aesdec	XDATA1, XKEY10
+	aesdec	XDATA2, XKEY10
+
+	movdqa	XKEY_B, [KEYS + 12*16]
+
+	aesdec	XDATA0, XKEY_A		; 11. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 13*16]
+
+	aesdec	XDATA0, XKEY_B		; 12. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 14*16]	; last round key of the 15-entry schedule
+
+	aesdec	XDATA0, XKEY_A		; 13. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+
+	aesdeclast	XDATA0, XKEY_B		; 14. DECLAST (final round)
+	aesdeclast	XDATA1, XKEY_B
+	aesdeclast	XDATA2, XKEY_B
+
+	pxor	XDATA0, IV_TMP		; CBC: first plaintext = D(C0) ^ IV
+	pxor	XDATA1, XSAVED0		; later plaintexts = D(Cn) ^ C(n-1)
+	pxor	XDATA2, XSAVED1
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+
+	cmp	LEN, 3*16
+	je	done			; nothing beyond the initial blocks
+	jmp	main_loop
+
+
+	align 16
+initial_4:
+	; load cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+	movdqu	XDATA3, [IN + 3*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XSAVED2, XDATA2
+	movdqa	XIV,     XDATA3
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	aesdec	XDATA0, XKEY_A		; 1. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+
+	mov	IDX, 4*16
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+	aesdec	XDATA3, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	aesdec	XDATA0, XKEY_A		; 3. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+
+	movdqu	IV_TMP, [IV]
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+	aesdec	XDATA3, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	aesdec	XDATA0, XKEY_A		; 5. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+	aesdec	XDATA3, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY_A		; 7. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+	aesdec	XDATA3, XKEY_B
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	aesdec	XDATA0, XKEY_A		; 9. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	aesdec	XDATA0, XKEY10	; 10. DEC
+	aesdec	XDATA1, XKEY10
+	aesdec	XDATA2, XKEY10
+	aesdec	XDATA3, XKEY10
+
+	movdqa	XKEY_B, [KEYS + 12*16]
+
+	aesdec	XDATA0, XKEY_A		; 11. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 13*16]
+
+	aesdec	XDATA0, XKEY_B		; 12. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+	aesdec	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 14*16]
+
+	aesdec	XDATA0, XKEY_A		; 13. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	aesdeclast	XDATA0, XKEY_B		; 14. DEC
+	aesdeclast	XDATA1, XKEY_B
+	aesdeclast	XDATA2, XKEY_B
+	aesdeclast	XDATA3, XKEY_B
+
+	pxor	XDATA0, IV_TMP
+	pxor	XDATA1, XSAVED0
+	pxor	XDATA2, XSAVED1
+	pxor	XDATA3, XSAVED2
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+	movdqu	[OUT + 3*16], XDATA3
+
+	cmp	LEN, 4*16
+	jz	done
+	jmp	main_loop
+
+	align 16
+main_loop:
+	; load cipher text
+	movdqu	XDATA0, [IN + IDX + 0*16]
+	movdqu	XDATA1, [IN + IDX + 1*16]
+	movdqu	XDATA2, [IN + IDX + 2*16]
+	movdqu	XDATA3, [IN + IDX + 3*16]
+
+	; save cipher text
+	movdqa	XSAVED0, XDATA0
+	movdqa	XSAVED1, XDATA1
+	movdqa	XSAVED2, XDATA2
+	movdqa	XSAVED3, XDATA3
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	add	IDX, 4*16
+
+	aesdec	XDATA0, XKEY_A		; 1. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+
+	aesdec	XDATA0, XKEY2		; 2. DEC
+	aesdec	XDATA1, XKEY2
+	aesdec	XDATA2, XKEY2
+	aesdec	XDATA3, XKEY2
+
+	aesdec	XDATA0, XKEY_A		; 3. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+
+	aesdec	XDATA0, XKEY4		; 4. DEC
+	aesdec	XDATA1, XKEY4
+	aesdec	XDATA2, XKEY4
+	aesdec	XDATA3, XKEY4
+
+	aesdec	XDATA0, XKEY_A		; 5. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	aesdec	XDATA0, XKEY6		; 6. DEC
+	aesdec	XDATA1, XKEY6
+	aesdec	XDATA2, XKEY6
+	aesdec	XDATA3, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	aesdec	XDATA0, XKEY_A		; 7. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	aesdec	XDATA0, XKEY_B		; 8. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+	aesdec	XDATA3, XKEY_B
+
+	aesdec	XDATA0, XKEY_A		; 9. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	aesdec	XDATA0, XKEY10		; 10. DEC
+	aesdec	XDATA1, XKEY10
+	aesdec	XDATA2, XKEY10
+	aesdec	XDATA3, XKEY10
+
+	movdqa	XKEY_B, [KEYS + 12*16]
+
+	aesdec	XDATA0, XKEY_A		; 11. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 13*16]
+
+	aesdec	XDATA0, XKEY_B		; 12. DEC
+	aesdec	XDATA1, XKEY_B
+	aesdec	XDATA2, XKEY_B
+	aesdec	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 14*16]
+
+	aesdec	XDATA0, XKEY_A		; 13. DEC
+	aesdec	XDATA1, XKEY_A
+	aesdec	XDATA2, XKEY_A
+	aesdec	XDATA3, XKEY_A
+
+	aesdeclast	XDATA0, XKEY_B		; 14. DEC
+	aesdeclast	XDATA1, XKEY_B
+	aesdeclast	XDATA2, XKEY_B
+	aesdeclast	XDATA3, XKEY_B
+
+	pxor	XDATA0, XIV
+	pxor	XDATA1, XSAVED0
+	pxor	XDATA2, XSAVED1
+	pxor	XDATA3, XSAVED2
+
+	movdqu	[OUT + IDX + 0*16 - 4*16], XDATA0
+	movdqu	[OUT + IDX + 1*16 - 4*16], XDATA1
+	movdqu	[OUT + IDX + 2*16 - 4*16], XDATA2
+	movdqu	[OUT + IDX + 3*16 - 4*16], XDATA3
+
+	movdqa	XIV, XSAVED3
+
+	CMP	IDX, LEN
+	jne	main_loop
+
+done:
+; Don't write back IV
+;	movdqu	[IV], XIV
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm
new file mode 100644
index 000000000..6d8f211f7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm
@@ -0,0 +1,483 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+; routine to do AES256 CNTR enc/decrypt "by4"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+%ifndef AES_CNTR_256
+%define AES_CNTR_256 aes_cntr_256_sse
+%define AES_CNTR_BIT_256 aes_cntr_bit_256_sse
+%endif
+
+extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+
+%define CONCAT(a,b) a %+ b
+%define MOVDQ movdqu
+
+%define xdata0	xmm0
+%define xdata1	xmm1
+%define xpart	xmm1	; same register as xdata1; holds the trailing partial input block
+%define xdata2	xmm2
+%define xdata3	xmm3
+%define xdata4	xmm4
+%define xdata5	xmm5
+%define xdata6	xmm6
+%define xdata7	xmm7
+%define xcounter xmm8
+%define xtmp    xmm8	; same register as xcounter; scratch in the partial-byte code
+%define xbyteswap xmm9
+%define xtmp2   xmm9	; same register as xbyteswap; holds the partial-byte mask
+%define xkey0 	xmm10
+%define xtmp3   xmm10	; same register as xkey0; scratch handed to XPSLLB
+%define xkey4 	xmm11
+%define xkey8 	xmm12
+%define xkey12	xmm13
+%define xkeyA	xmm14
+%define xkeyB	xmm15
+; Argument registers: LINUX builds take all six in rdi/rsi/rdx/rcx/r8/r9; otherwise four in rcx/rdx/r8/r9 and two on the stack
+%ifdef LINUX
+%define p_in	  rdi
+%define p_IV	  rsi
+%define p_keys	  rdx
+%define p_out	  rcx
+%define num_bytes r8
+%define num_bits  r8
+%define p_ivlen   r9
+%else
+%define p_in	  rcx
+%define p_IV	  rdx
+%define p_keys	  r8
+%define p_out	  r9
+%define num_bytes r10
+%define num_bits  r10
+%define p_ivlen   qword [rsp + 8*6]	; 6th argument read in place from the stack; offset is relative to rsp at entry
+%endif
+; tmp is general scratch; r_bits, tmp2 and mask are only used by the CNTR_BIT variant, which pushes r12-r14 first
+%define tmp	r11
+
+%define r_bits   r12
+%define tmp2    r13
+%define mask    r14
+
+%macro do_aes_load 2
+	do_aes %1, %2, 1	; load_keys = 1: do_aes also loads xkey0/xkey4/xkey8/xkey12 from p_keys
+%endmacro
+
+%macro do_aes_noload 2
+	do_aes %1, %2, 0	; load_keys = 0: xkey0/xkey4/xkey8/xkey12 must already hold round keys 0/4/8/12
+%endmacro
+
+
+; do_aes num_in_par, cntr_type (CNTR/CNTR_BIT), load_keys (0/1)
+; Encrypts num_in_par counter blocks, XORs them with [p_in] and stores to [p_out]; increments p_in, but not p_out
+%macro do_aes 3
+%define %%by %1
+%define %%cntr_type %2
+%define %%load_keys %3
+; When load_keys is 0, xkey0/xkey4/xkey8/xkey12 are used as found and must already hold round keys 0/4/8/12
+%if (%%load_keys)
+	movdqa	xkey0, [p_keys + 0*16]
+%endif
+
+	movdqa	xdata0, xcounter
+	pshufb	xdata0, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+	movdqa	CONCAT(xdata,i), xcounter
+%ifidn %%cntr_type, CNTR_BIT
+	paddq	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]	; 64-bit add, matching the xcounter update below
+%else
+	paddd	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
+%endif
+	pshufb	CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+	movdqa	xkeyA, [p_keys + 1*16]
+	pxor	xdata0, xkey0
+%ifidn %%cntr_type, CNTR_BIT
+	paddq	xcounter, [rel CONCAT(ddq_add_,%%by)]
+%else
+	paddd	xcounter, [rel CONCAT(ddq_add_,%%by)]
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+	pxor	CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 1
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 2
+%assign i (i+1)
+%endrep
+
+	add	p_in, 16*%%by			; advanced early; the input loads below subtract 16*by again
+
+%if (%%load_keys)
+	movdqa	xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 3
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey4		; key 4
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 5
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 7
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey8		; key 8
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 9
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 11
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 13*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey12	; key 12
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 14*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 13
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+	aesenclast	CONCAT(xdata,i), xkeyB	; key 14
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
+	MOVDQ	xkeyB, [p_in + j*16 - 16*%%by]
+	pxor	CONCAT(xdata,i), xkeyA
+	pxor	CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
+	pxor	CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+        ;; End of message? True when num_bytes has no bit set other than the by*16 bit
+        mov     tmp, num_bytes
+        and     tmp, ~(%%by*16)
+        jnz     %%skip_preserve
+        ;; Check if there is a partial byte
+        or      r_bits, r_bits
+        jz      %%skip_preserve
+
+%assign idx (%%by - 1)
+        ;; Load output to get last partial byte (xtmp is xcounter's register: the counter is overwritten here)
+        movdqu         xtmp, [p_out + idx * 16]
+        ;; Save RCX in temporary GP register
+        mov             tmp, rcx
+        mov             mask, 0xff
+        mov             cl, BYTE(r_bits)
+        shr             mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+        mov             rcx, tmp
+
+        movq            xtmp2, mask
+        pslldq          xtmp2, 15
+        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+        ;; in the partial byte
+        ;; Clear all the bits that do not need to be preserved from the output
+        pand            xtmp, xtmp2
+
+        ;; Keep only the ciphered bits of the last block, then merge in the preserved output bits
+        pandn	        xtmp2, CONCAT(xdata, idx)
+        por             xtmp2, xtmp
+        movdqa		CONCAT(xdata, idx), xtmp2
+
+%%skip_preserve:
+%endif
+
+%assign i 0
+%rep %%by
+	MOVDQ	[p_out  + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+
+;; Macro performing AES-CTR.
+;; Expands to a complete function body ending in ret; %1 selects byte-length (CNTR) or bit-length (CNTR_BIT) handling
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+
+%ifndef LINUX
+	mov	num_bytes, [rsp + 8*5]	; 5th argument comes from the stack; read before the pushes below move rsp
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        push r12
+        push r13
+        push r14
+%endif
+
+	movdqa	xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+        test    p_ivlen, 16		; bit 4 of the IV length set -> take the 16-byte IV path
+        jnz     %%iv_is_16_bytes
+        ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+        mov     DWORD(tmp), 0x01000000
+        pinsrq  xcounter, [p_IV], 0
+        pinsrd  xcounter, [p_IV + 8], 2
+        pinsrd  xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT
+        ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+        movdqu  xcounter, [p_IV]
+%endif
+
+%%bswap_iv:
+	pshufb	xcounter, xbyteswap	; counter is kept byte-swapped so paddd/paddq can increment it
+
+        ;; calculate len
+        ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        mov     r_bits, num_bits
+        add     num_bits, 7
+        shr     num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+        and     r_bits, 7   ; Check if there are remainder bits (0-7)
+%endif
+	mov	tmp, num_bytes
+	and	tmp, 3*16
+	jz	%%chk             ; no leading group of 1-3 blocks to process
+
+	; tmp is 16, 32 or 48 here: handle 1, 2 or 3 blocks before the 4-block loop
+	cmp	tmp, 2*16
+	jg	%%eq3
+	je	%%eq2
+%%eq1:
+	do_aes_load	1, %%CNTR_TYPE
+	add	p_out, 1*16
+        jmp     %%chk
+
+%%eq2:
+	do_aes_load	2, %%CNTR_TYPE
+	add	p_out, 2*16
+        jmp      %%chk
+
+%%eq3:
+	do_aes_load	3, %%CNTR_TYPE
+	add	p_out, 3*16
+	; fall through to chk
+%%chk:
+        and	num_bytes, ~(3*16)	; clear the 16 and 32 bits: those blocks were handled above
+	jz	%%do_return2
+
+        cmp	num_bytes, 16
+        jb	%%last			; fewer than 16 bytes remain: only a partial block
+
+	; process multiples of 4 blocks
+	movdqa	xkey0, [p_keys + 0*16]
+	movdqa	xkey4, [p_keys + 4*16]
+	movdqa	xkey8, [p_keys + 8*16]
+	movdqa	xkey12, [p_keys + 12*16]
+
+align 32
+%%main_loop2:
+	; num_bytes is a multiple of 4 blocks + partial bytes
+	do_aes_noload	4, %%CNTR_TYPE
+	add	p_out,	4*16
+	sub	num_bytes, 4*16
+        cmp	num_bytes, 4*16
+	jae	%%main_loop2
+
+        ; Check if there is a partial block
+	or      num_bytes, num_bytes
+        jnz    %%last
+
+%%do_return2:
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        pop r14
+        pop r13
+        pop r12
+%endif
+
+	ret
+
+%%last:
+
+	; load partial block into XMM register
+	simd_load_sse_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+	; Encryption of a single partial block
+	pshufb	xcounter, xbyteswap	; undo the byte swap applied at %%bswap_iv
+	movdqa	xdata0, xcounter
+	pxor	xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 13
+        aesenc  xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+	; created keystream
+        aesenclast xdata0, [p_keys + 16*i]
+
+	; xor keystream with the message (scratch)
+        pxor    xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+        ;; Check if there is a partial byte
+        or      r_bits, r_bits
+        jz      %%store_output
+
+        ;; Load output to get last partial byte
+        simd_load_sse_15_1 xtmp, p_out, num_bytes
+
+        ;; Save RCX in temporary GP register
+        mov     tmp, rcx
+        mov     mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+        mov     cl, BYTE(r_bits)
+        shr     mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+        mov     rcx, tmp
+
+        movq    xtmp2, mask
+
+        ;; Get number of full bytes in last block of 16 bytes
+        mov     tmp, num_bytes
+        dec     tmp
+        XPSLLB  xtmp2, tmp, xtmp3, tmp2
+        ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+        ;; in the partial byte
+
+        ;; Clear all the bits that do not need to be preserved from the output
+        pand    xtmp, xtmp2
+
+        ;; Keep only the ciphered bits of the block, then merge in the preserved output bits
+        pandn   xtmp2, xdata0
+        por     xtmp2, xtmp
+        movdqa  xdata0, xtmp2
+%endif
+
+%%store_output:
+        ; copy result into the output buffer
+        simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax
+
+        jmp	%%do_return2
+
+%%iv_is_16_bytes:
+        ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+        movdqu  xcounter, [p_IV]
+        jmp     %%bswap_iv
+%endmacro
+
+align 32
+;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
+MKGLOBAL(AES_CNTR_256,function,internal)
+AES_CNTR_256:
+        DO_CNTR CNTR
+;; The bit-length variant below always reads a 16-byte IV; DO_CNTR does not look at iv_len for CNTR_BIT
+;; aes_cntr_bit_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bits, UINT64 iv_len)
+MKGLOBAL(AES_CNTR_BIT_256,function,internal)
+AES_CNTR_BIT_256:
+        DO_CNTR CNTR_BIT
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm
new file mode 100644
index 000000000..4b07ecf90
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm
@@ -0,0 +1,380 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; Routine to do a 128 bit CBC AES encryption / CBC-MAC digest computation
+;;; processes 4 buffers at a time, single data structure as input
+;;; Updates In and Out pointers at end
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define	MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+	MOVDQ	XTMP, %2	; load the (possibly unaligned) source %2 into scratch XTMP
+	pxor	%1, XTMP	; %1 ^= XTMP; XTMP is clobbered
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;;     void*    in[8];
+;;     void*    out[8];
+;;     UINT128* keys[8];
+;;     UINT128  IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_128_x4(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK
+_gpr_save:	resq	8	; slot 0 = rbp; slots 1-5 = rbx, r12-r15; slots 6-7 = rsi, rdi (non-LINUX builds)
+endstruc
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define arg3	rdx
+%define arg4	rcx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define arg3	rdi             ; scratch only (KEYS1); r8 is not used here because it holds IN0
+%define arg4	rsi             ; scratch only (KEYS2); r9 is not used here because it holds OUT0
+%endif
+
+%define ARG	arg1
+%define LEN	arg2
+
+%define IDX	rax
+
+%define IN0	r8
+%define KEYS0	rbx
+
+%define IN1	r10
+%define KEYS1	arg3
+
+%define IN2	r12
+%define KEYS2	arg4
+
+%define IN3	r14
+%define KEYS3	rbp
+
+%ifndef CBC_MAC
+;; No cipher text write back for CBC-MAC
+%define OUT0	r9
+%define OUT1	r11
+%define OUT2	r13
+%define OUT3	r15
+%endif
+
+%define XDATA0		xmm0
+%define XDATA1		xmm1
+%define XDATA2		xmm2
+%define XDATA3		xmm3
+
+%define XKEY0_3		xmm4
+%define XKEY0_6		[KEYS0 + 16*6]	; memory operand: xmm0-xmm15 are all taken by the other aliases
+%define XTMP		xmm5
+%define XKEY0_9		xmm6
+
+%define XKEY1_3		xmm7
+%define XKEY1_6		xmm8
+%define XKEY1_9		xmm9
+
+%define XKEY2_3		xmm10
+%define XKEY2_6		xmm11
+%define XKEY2_9		xmm12
+
+%define XKEY3_3		xmm13
+%define XKEY3_6		xmm14
+%define XKEY3_9		xmm15
+
+section .text
+
+%ifndef AES_CBC_ENC_X4
+;; Entry label is chosen at build time: CBC_MAC selects the MAC names, AES_CBC_ENC_X4 selects the _no_aesni names
+%ifdef CBC_MAC
+MKGLOBAL(aes128_cbc_mac_x4,function,internal)
+aes128_cbc_mac_x4:
+%else
+MKGLOBAL(aes_cbc_enc_128_x4,function,internal)
+aes_cbc_enc_128_x4:
+%endif
+
+%else ;; AES_CBC_ENC_X4 already defined
+
+%ifdef CBC_MAC
+MKGLOBAL(aes128_cbc_mac_x4_no_aesni,function,internal)
+aes128_cbc_mac_x4_no_aesni:
+%else
+MKGLOBAL(aes_cbc_enc_128_x4_no_aesni,function,internal)
+aes_cbc_enc_128_x4_no_aesni:
+%endif
+;; NOTE(review): rbx/r12-r15 (and rsi/rdi on non-LINUX) are saved only under CBC_MAC, yet the non-MAC build also overwrites them; presumably its caller preserves them - confirm
+%endif
+	sub	rsp, STACK_size			; reserve the GPR save area
+	mov	[rsp + _gpr_save + 8*0], rbp	; rbp (KEYS3) is saved in every build
+%ifdef CBC_MAC
+	mov	[rsp + _gpr_save + 8*1], rbx
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+%endif
+	mov	IDX, 16				; byte offset of the second block; the first block is handled before the loop
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+	mov	        IN0,	[ARG + _aesarg_in + 8*0]
+	mov	        IN1,	[ARG + _aesarg_in + 8*1]
+	mov	        IN2,	[ARG + _aesarg_in + 8*2]
+	mov	        IN3,	[ARG + _aesarg_in + 8*3]
+
+	MOVDQ		XDATA0, [IN0]		; load first block of plain text
+	MOVDQ		XDATA1, [IN1]		; load first block of plain text
+	MOVDQ		XDATA2, [IN2]		; load first block of plain text
+	MOVDQ		XDATA3, [IN3]		; load first block of plain text
+
+	mov		KEYS0,	[ARG + _aesarg_keys + 8*0]
+	mov		KEYS1,	[ARG + _aesarg_keys + 8*1]
+	mov		KEYS2,	[ARG + _aesarg_keys + 8*2]
+	mov		KEYS3,	[ARG + _aesarg_keys + 8*3]
+
+	pxor		XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+	pxor		XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+	pxor		XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+	pxor		XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+
+%ifndef CBC_MAC
+	mov		OUT0,	[ARG + _aesarg_out + 8*0]
+	mov		OUT1,	[ARG + _aesarg_out + 8*1]
+	mov		OUT2,	[ARG + _aesarg_out + 8*2]
+	mov		OUT3,	[ARG + _aesarg_out + 8*3]
+%endif
+
+	pxor		XDATA0, [KEYS0 + 16*0]		; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0]		; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0]		; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0]		; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+
+	movdqa		XKEY0_3, [KEYS0 + 16*3]	; load round 3 key
+	movdqa		XKEY1_3, [KEYS1 + 16*3]	; load round 3 key
+	movdqa		XKEY2_3, [KEYS2 + 16*3]	; load round 3 key
+	movdqa		XKEY3_3, [KEYS3 + 16*3]	; load round 3 key
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+
+	movdqa		XKEY1_6, [KEYS1 + 16*6]	; load round 6 key
+	movdqa		XKEY2_6, [KEYS2 + 16*6]	; load round 6 key
+	movdqa		XKEY3_6, [KEYS3 + 16*6]	; load round 6 key
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC (XKEY0_6 is a memory operand, not a register)
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	movdqa		XKEY0_9, [KEYS0 + 16*9]	; load round 9 key
+	movdqa		XKEY1_9, [KEYS1 + 16*9]	; load round 9 key
+	movdqa		XKEY2_9, [KEYS2 + 16*9]	; load round 9 key
+	movdqa		XKEY3_9, [KEYS3 + 16*9]	; load round 9 key
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenclast	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenclast	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenclast	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenclast	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+%ifndef CBC_MAC
+	MOVDQ		[OUT0], XDATA0		; write back ciphertext
+	MOVDQ		[OUT1], XDATA1		; write back ciphertext
+	MOVDQ		[OUT2], XDATA2		; write back ciphertext
+	MOVDQ		[OUT3], XDATA3		; write back ciphertext
+%endif
+	cmp		LEN, IDX		; LEN == 16: the single block is already done
+	je		done
+
+main_loop:
+	pxor2		XDATA0, [IN0 + IDX]	; plaintext XOR previous ciphertext block
+	pxor2		XDATA1, [IN1 + IDX]	; plaintext XOR previous ciphertext block
+	pxor2		XDATA2, [IN2 + IDX]	; plaintext XOR previous ciphertext block
+	pxor2		XDATA3, [IN3 + IDX]	; plaintext XOR previous ciphertext block
+
+	pxor		XDATA0, [KEYS0 + 16*0] 	; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0] 	; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0] 	; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0] 	; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenclast	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenclast	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenclast	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenclast	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+%ifndef CBC_MAC
+        ;; No cipher text write back for CBC-MAC
+	MOVDQ		[OUT0 + IDX], XDATA0	; write back ciphertext
+	MOVDQ		[OUT1 + IDX], XDATA1	; write back ciphertext
+	MOVDQ		[OUT2 + IDX], XDATA2	; write back ciphertext
+	MOVDQ		[OUT3 + IDX], XDATA3	; write back ciphertext
+%endif
+
+	add	IDX, 16
+	cmp	LEN, IDX
+	jne	main_loop	; exits only on equality: assumes LEN is a non-zero multiple of 16 - TODO confirm with callers
+
+done:
+	;; update IV / store digest for CBC-MAC (movdqa: the IV slots must be 16-byte aligned)
+	movdqa	[ARG + _aesarg_IV + 16*0], XDATA0
+	movdqa	[ARG + _aesarg_IV + 16*1], XDATA1
+	movdqa	[ARG + _aesarg_IV + 16*2], XDATA2
+	movdqa	[ARG + _aesarg_IV + 16*3], XDATA3
+
+	;; update IN and OUT: each stored pointer is advanced by LEN
+	add	IN0, LEN
+	mov	[ARG + _aesarg_in + 8*0], IN0
+	add	IN1, LEN
+	mov	[ARG + _aesarg_in + 8*1], IN1
+	add	IN2, LEN
+	mov	[ARG + _aesarg_in + 8*2], IN2
+	add	IN3, LEN
+	mov	[ARG + _aesarg_in + 8*3], IN3
+
+%ifndef CBC_MAC
+        ;; No OUT pointer updates for CBC-MAC
+	add	OUT0, LEN
+	mov	[ARG + _aesarg_out + 8*0], OUT0
+	add	OUT1, LEN
+	mov	[ARG + _aesarg_out + 8*1], OUT1
+	add	OUT2, LEN
+	mov	[ARG + _aesarg_out + 8*2], OUT2
+	add	OUT3, LEN
+	mov	[ARG + _aesarg_out + 8*3], OUT3
+%endif
+
+%ifdef CBC_MAC
+	mov	rbx, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+%endif
+	mov	rbp, [rsp + _gpr_save + 8*0]
+	add	rsp, STACK_size
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm
new file mode 100644
index 000000000..c9f1cc3c5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm
@@ -0,0 +1,349 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;;; process 4 buffers at a time, single data structure as input
+;;; Updates In and Out pointers at end
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define	MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2		;; pxor2 dst, src : XOR dst with a 16-byte source that may be unaligned
+	MOVDQ	XTMP, %2	; fetch the source with the unaligned-safe move (XTMP is overwritten)
+	pxor	%1, XTMP	; register-to-register XOR, so no alignment requirement on the source
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;;     void*    in[8];
+;;     void*    out[8];
+;;     UINT128* keys[8];
+;;     UINT128  IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_192_x4(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+%ifdef LINUX
+%define ARG	rdi	; 1st integer argument: AES_ARGS pointer
+%define LEN	rsi	; 2nd integer argument: length in bytes
+%define REG3	rcx	; spare register, used below as KEYS1
+%define REG4	rdx	; spare register, used below as KEYS2
+%else
+%define ARG	rcx	; 1st integer argument: AES_ARGS pointer
+%define LEN	rdx	; 2nd integer argument: length in bytes
+%define REG3	rsi	; spare register, used below as KEYS1
+%define REG4	rdi	; spare register, used below as KEYS2
+%endif
+
+%define IDX	rax	; byte offset of the current block within every lane's buffer
+
+%define IN0	r8	; lane 0: input pointer
+%define KEYS0	rbx	; lane 0: expanded key schedule pointer
+%define OUT0	r9	; lane 0: output pointer
+
+%define IN1	r10	; lane 1: input pointer
+%define KEYS1	REG3	; lane 1: expanded key schedule pointer (rcx or rsi, see above)
+%define OUT1	r11	; lane 1: output pointer
+
+%define IN2	r12	; lane 2: input pointer
+%define KEYS2	REG4	; lane 2: expanded key schedule pointer (rdx or rdi, see above)
+%define OUT2	r13	; lane 2: output pointer
+
+%define IN3	r14	; lane 3: input pointer
+%define KEYS3	rbp	; lane 3: expanded key schedule pointer (rbp is pushed/popped by the routine)
+%define OUT3	r15	; lane 3: output pointer
+
+
+%define XDATA0		xmm0	; lane 0 working block; holds the last ciphertext between blocks
+%define XDATA1		xmm1	; lane 1 working block
+%define XDATA2		xmm2	; lane 2 working block
+%define XDATA3		xmm3	; lane 3 working block
+
+%define XKEY0_3		xmm4	; XKEYn_r = cached round key r of lane n
+%define XKEY0_6		[KEYS0 + 16*6]	; memory operand: all 16 xmm registers are taken, so lane 0 round 6 is read from memory
+%define XTMP		xmm5	; scratch register used by pxor2
+%define XKEY0_9		xmm6
+
+%define XKEY1_3		xmm7
+%define XKEY1_6		xmm8
+%define XKEY1_9		xmm9
+
+%define XKEY2_3		xmm10
+%define XKEY2_6		xmm11
+%define XKEY2_9		xmm12
+
+%define XKEY3_3		xmm13
+%define XKEY3_6		xmm14
+%define XKEY3_9		xmm15
+
+%ifndef AES_CBC_ENC_X4
+%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4	; default symbol name; define AES_CBC_ENC_X4 beforehand to override it
+%endif
+
+section .text
+
+MKGLOBAL(AES_CBC_ENC_X4,function,internal)
+AES_CBC_ENC_X4:
+	;; rbp is reused as KEYS3, so it is saved here and restored before ret
+	push	rbp
+	;; NOTE(review): rbx, r12-r15, xmm6-xmm15 (and rsi/rdi when LINUX is not defined) are overwritten without being saved in this routine - presumably the caller preserves them; confirm
+	mov	IDX, 16			; byte offset of the next block; block 0 is handled before main_loop
+
+	mov	IN0,	[ARG + _aesarg_in + 8*0]
+	mov	IN1,	[ARG + _aesarg_in + 8*1]
+	mov	IN2,	[ARG + _aesarg_in + 8*2]
+	mov	IN3,	[ARG + _aesarg_in + 8*3]
+
+	;; ---- block 0 of every lane: plaintext XOR IV, then the 12 AES-192 rounds, lanes interleaved round by round ----
+
+	MOVDQ		XDATA0, [IN0]		; load first block of plain text
+	MOVDQ		XDATA1, [IN1]		; load first block of plain text
+	MOVDQ		XDATA2, [IN2]		; load first block of plain text
+	MOVDQ		XDATA3, [IN3]		; load first block of plain text
+
+	mov		KEYS0,	[ARG + _aesarg_keys + 8*0]
+	mov		KEYS1,	[ARG + _aesarg_keys + 8*1]
+	mov		KEYS2,	[ARG + _aesarg_keys + 8*2]
+	mov		KEYS3,	[ARG + _aesarg_keys + 8*3]
+	;; pxor/aesenc/movdqa take 16-byte memory operands directly from here on, so the IV slots and key schedules must be 16-byte aligned
+	pxor		XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+	pxor		XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+	pxor		XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+	pxor		XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+
+	mov		OUT0,	[ARG + _aesarg_out + 8*0]
+	mov		OUT1,	[ARG + _aesarg_out + 8*1]
+	mov		OUT2,	[ARG + _aesarg_out + 8*2]
+	mov		OUT3,	[ARG + _aesarg_out + 8*3]
+	;; round 0 (ARK = AddRoundKey) is a plain XOR with round key 0; rounds 1-11 use aesenc
+	pxor		XDATA0, [KEYS0 + 16*0]		; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0]		; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0]		; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0]		; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+	;; round keys 3, 6 and 9 are loaded into xmm registers once here; main_loop reuses them without reloading
+	movdqa		XKEY0_3, [KEYS0 + 16*3]	; load round 3 key
+	movdqa		XKEY1_3, [KEYS1 + 16*3]	; load round 3 key
+	movdqa		XKEY2_3, [KEYS2 + 16*3]	; load round 3 key
+	movdqa		XKEY3_3, [KEYS3 + 16*3]	; load round 3 key
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+	;; only lanes 1-3 are loaded: XKEY0_6 is defined as the memory operand [KEYS0 + 16*6]
+	movdqa		XKEY1_6, [KEYS1 + 16*6]	; load round 6 key
+	movdqa		XKEY2_6, [KEYS2 + 16*6]	; load round 6 key
+	movdqa		XKEY3_6, [KEYS3 + 16*6]	; load round 6 key
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	movdqa		XKEY0_9, [KEYS0 + 16*9]	; load round 9 key
+	movdqa		XKEY1_9, [KEYS1 + 16*9]	; load round 9 key
+	movdqa		XKEY2_9, [KEYS2 + 16*9]	; load round 9 key
+	movdqa		XKEY3_9, [KEYS3 + 16*9]	; load round 9 key
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenc    	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenc    	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenc    	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenc    	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*11]	; 11. ENC
+	aesenc		XDATA1, [KEYS1 + 16*11]	; 11. ENC
+	aesenc		XDATA2, [KEYS2 + 16*11]	; 11. ENC
+	aesenc		XDATA3, [KEYS3 + 16*11]	; 11. ENC
+	;; round 12 is the final round of this 192-bit variant, hence aesenclast with round key 12
+	aesenclast		XDATA0, [KEYS0 + 16*12]	; 12. ENC
+	aesenclast		XDATA1, [KEYS1 + 16*12]	; 12. ENC
+	aesenclast		XDATA2, [KEYS2 + 16*12]	; 12. ENC
+	aesenclast		XDATA3, [KEYS3 + 16*12]	; 12. ENC
+
+	MOVDQ		[OUT0], XDATA0		; write back ciphertext
+	MOVDQ		[OUT1], XDATA1		; write back ciphertext
+	MOVDQ		[OUT2], XDATA2		; write back ciphertext
+	MOVDQ		[OUT3], XDATA3		; write back ciphertext
+	;; IDX is 16 here: when LEN is exactly one block there is nothing left for main_loop
+	cmp		LEN, IDX
+	je		done
+
+main_loop:
+	pxor2		XDATA0, [IN0 + IDX]	; plaintext XOR previous ciphertext
+	pxor2		XDATA1, [IN1 + IDX]	; plaintext XOR previous ciphertext
+	pxor2		XDATA2, [IN2 + IDX]	; plaintext XOR previous ciphertext
+	pxor2		XDATA3, [IN3 + IDX]	; plaintext XOR previous ciphertext
+	;; XDATAn entered the loop holding lane n's previous ciphertext block, i.e. the CBC chaining value
+
+	pxor		XDATA0, [KEYS0 + 16*0] 	; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0] 	; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0] 	; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0] 	; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenc    	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenc    	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenc    	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenc    	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*11]	; 11. ENC
+	aesenc		XDATA1, [KEYS1 + 16*11]	; 11. ENC
+	aesenc		XDATA2, [KEYS2 + 16*11]	; 11. ENC
+	aesenc		XDATA3, [KEYS3 + 16*11]	; 11. ENC
+
+	aesenclast		XDATA0, [KEYS0 + 16*12]	; 12. ENC
+	aesenclast		XDATA1, [KEYS1 + 16*12]	; 12. ENC
+	aesenclast		XDATA2, [KEYS2 + 16*12]	; 12. ENC
+	aesenclast		XDATA3, [KEYS3 + 16*12]	; 12. ENC
+
+
+
+	MOVDQ		[OUT0 + IDX], XDATA0	; write back ciphertext
+	MOVDQ		[OUT1 + IDX], XDATA1	; write back ciphertext
+	MOVDQ		[OUT2 + IDX], XDATA2	; write back ciphertext
+	MOVDQ		[OUT3 + IDX], XDATA3	; write back ciphertext
+
+	;; the exit test below is an equality check, so LEN is expected to be a non-zero multiple of 16
+	add	IDX, 16
+	cmp	LEN, IDX
+	jne	main_loop
+
+done:
+	;; update IV with the last ciphertext block of each lane
+	movdqa	[ARG + _aesarg_IV + 16*0], XDATA0
+	movdqa	[ARG + _aesarg_IV + 16*1], XDATA1
+	movdqa	[ARG + _aesarg_IV + 16*2], XDATA2
+	movdqa	[ARG + _aesarg_IV + 16*3], XDATA3
+
+	;; update IN and OUT: advance every lane pointer by LEN and store it back into AES_ARGS
+	add	IN0, LEN
+	mov	[ARG + _aesarg_in + 8*0], IN0
+	add	IN1, LEN
+	mov	[ARG + _aesarg_in + 8*1], IN1
+	add	IN2, LEN
+	mov	[ARG + _aesarg_in + 8*2], IN2
+	add	IN3, LEN
+	mov	[ARG + _aesarg_in + 8*3], IN3
+
+	add	OUT0, LEN
+	mov	[ARG + _aesarg_out + 8*0], OUT0
+	add	OUT1, LEN
+	mov	[ARG + _aesarg_out + 8*1], OUT1
+	add	OUT2, LEN
+	mov	[ARG + _aesarg_out + 8*2], OUT2
+	add	OUT3, LEN
+	mov	[ARG + _aesarg_out + 8*3], OUT3
+
+	pop	rbp
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm
new file mode 100644
index 000000000..e51f4caac
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm
@@ -0,0 +1,368 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;;; process 4 buffers at a time, single data structure as input
+;;; Updates In and Out pointers at end
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define	MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2		;; pxor2 dst, src : XOR dst with a 16-byte source that may be unaligned
+	MOVDQ	XTMP, %2	; fetch the source with the unaligned-safe move (XTMP is overwritten)
+	pxor	%1, XTMP	; register-to-register XOR, so no alignment requirement on the source
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;;     void*    in[8];
+;;     void*    out[8];
+;;     UINT128* keys[8];
+;;     UINT128  IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_256_x4(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+%ifdef LINUX
+%define ARG	rdi	; 1st integer argument: AES_ARGS pointer
+%define LEN	rsi	; 2nd integer argument: length in bytes
+%define REG3	rcx	; spare register, used below as KEYS1
+%define REG4	rdx	; spare register, used below as KEYS2
+%else
+%define ARG	rcx	; 1st integer argument: AES_ARGS pointer
+%define LEN	rdx	; 2nd integer argument: length in bytes
+%define REG3	rsi	; spare register, used below as KEYS1
+%define REG4	rdi	; spare register, used below as KEYS2
+%endif
+
+%define IDX	rax	; byte offset of the current block within every lane's buffer
+
+%define IN0	r8	; lane 0: input pointer
+%define KEYS0	rbx	; lane 0: expanded key schedule pointer
+%define OUT0	r9	; lane 0: output pointer
+
+%define IN1	r10	; lane 1: input pointer
+%define KEYS1	REG3	; lane 1: expanded key schedule pointer (rcx or rsi, see above)
+%define OUT1	r11	; lane 1: output pointer
+
+%define IN2	r12	; lane 2: input pointer
+%define KEYS2	REG4	; lane 2: expanded key schedule pointer (rdx or rdi, see above)
+%define OUT2	r13	; lane 2: output pointer
+
+%define IN3	r14	; lane 3: input pointer
+%define KEYS3	rbp	; lane 3: expanded key schedule pointer (rbp is pushed/popped by the routine)
+%define OUT3	r15	; lane 3: output pointer
+
+
+%define XDATA0		xmm0	; lane 0 working block; holds the last ciphertext between blocks
+%define XDATA1		xmm1	; lane 1 working block
+%define XDATA2		xmm2	; lane 2 working block
+%define XDATA3		xmm3	; lane 3 working block
+
+%define XKEY0_3		xmm4	; XKEYn_r = cached round key r of lane n
+%define XKEY0_6		[KEYS0 + 16*6]	; memory operand: all 16 xmm registers are taken, so lane 0 round 6 is read from memory
+%define XTMP		xmm5	; scratch register used by pxor2
+%define XKEY0_9		xmm6
+
+%define XKEY1_3		xmm7
+%define XKEY1_6		xmm8
+%define XKEY1_9		xmm9
+
+%define XKEY2_3		xmm10
+%define XKEY2_6		xmm11
+%define XKEY2_9		xmm12
+
+%define XKEY3_3		xmm13
+%define XKEY3_6		xmm14
+%define XKEY3_9		xmm15
+
+%ifndef AES_CBC_ENC_X4
+%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4	; default symbol name; define AES_CBC_ENC_X4 beforehand to override it
+%endif
+
+section .text
+
+MKGLOBAL(AES_CBC_ENC_X4,function,internal)
+AES_CBC_ENC_X4:
+	;; rbp is reused as KEYS3, so it is saved here and restored before ret
+	push	rbp
+	;; NOTE(review): rbx, r12-r15, xmm6-xmm15 (and rsi/rdi when LINUX is not defined) are overwritten without being saved in this routine - presumably the caller preserves them; confirm
+	mov	IDX, 16			; byte offset of the next block; block 0 is handled before main_loop
+
+	mov	IN0,	[ARG + _aesarg_in + 8*0]
+	mov	IN1,	[ARG + _aesarg_in + 8*1]
+	mov	IN2,	[ARG + _aesarg_in + 8*2]
+	mov	IN3,	[ARG + _aesarg_in + 8*3]
+
+	;; ---- block 0 of every lane: plaintext XOR IV, then the 14 AES-256 rounds, lanes interleaved round by round ----
+
+	MOVDQ		XDATA0, [IN0]		; load first block of plain text
+	MOVDQ		XDATA1, [IN1]		; load first block of plain text
+	MOVDQ		XDATA2, [IN2]		; load first block of plain text
+	MOVDQ		XDATA3, [IN3]		; load first block of plain text
+
+	mov		KEYS0,	[ARG + _aesarg_keys + 8*0]
+	mov		KEYS1,	[ARG + _aesarg_keys + 8*1]
+	mov		KEYS2,	[ARG + _aesarg_keys + 8*2]
+	mov		KEYS3,	[ARG + _aesarg_keys + 8*3]
+	;; pxor/aesenc/movdqa take 16-byte memory operands directly from here on, so the IV slots and key schedules must be 16-byte aligned
+	pxor		XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+	pxor		XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+	pxor		XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+	pxor		XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+
+	mov		OUT0,	[ARG + _aesarg_out + 8*0]
+	mov		OUT1,	[ARG + _aesarg_out + 8*1]
+	mov		OUT2,	[ARG + _aesarg_out + 8*2]
+	mov		OUT3,	[ARG + _aesarg_out + 8*3]
+	;; round 0 (ARK = AddRoundKey) is a plain XOR with round key 0; rounds 1-13 use aesenc
+	pxor		XDATA0, [KEYS0 + 16*0]		; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0]		; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0]		; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0]		; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+	;; round keys 3, 6 and 9 are loaded into xmm registers once here; main_loop reuses them without reloading
+	movdqa		XKEY0_3, [KEYS0 + 16*3]	; load round 3 key
+	movdqa		XKEY1_3, [KEYS1 + 16*3]	; load round 3 key
+	movdqa		XKEY2_3, [KEYS2 + 16*3]	; load round 3 key
+	movdqa		XKEY3_3, [KEYS3 + 16*3]	; load round 3 key
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+	;; only lanes 1-3 are loaded: XKEY0_6 is defined as the memory operand [KEYS0 + 16*6]
+	movdqa		XKEY1_6, [KEYS1 + 16*6]	; load round 6 key
+	movdqa		XKEY2_6, [KEYS2 + 16*6]	; load round 6 key
+	movdqa		XKEY3_6, [KEYS3 + 16*6]	; load round 6 key
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	movdqa		XKEY0_9, [KEYS0 + 16*9]	; load round 9 key
+	movdqa		XKEY1_9, [KEYS1 + 16*9]	; load round 9 key
+	movdqa		XKEY2_9, [KEYS2 + 16*9]	; load round 9 key
+	movdqa		XKEY3_9, [KEYS3 + 16*9]	; load round 9 key
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenc    	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenc    	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenc    	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenc    	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*11]	; 11. ENC
+	aesenc		XDATA1, [KEYS1 + 16*11]	; 11. ENC
+	aesenc		XDATA2, [KEYS2 + 16*11]	; 11. ENC
+	aesenc		XDATA3, [KEYS3 + 16*11]	; 11. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*12]	; 12. ENC
+	aesenc		XDATA1, [KEYS1 + 16*12]	; 12. ENC
+	aesenc		XDATA2, [KEYS2 + 16*12]	; 12. ENC
+	aesenc		XDATA3, [KEYS3 + 16*12]	; 12. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*13]	; 13. ENC
+	aesenc		XDATA1, [KEYS1 + 16*13]	; 13. ENC
+	aesenc		XDATA2, [KEYS2 + 16*13]	; 13. ENC
+	aesenc		XDATA3, [KEYS3 + 16*13]	; 13. ENC
+	;; round 14 is the final round of this 256-bit variant, hence aesenclast with round key 14
+	aesenclast	XDATA0, [KEYS0 + 16*14]	; 14. ENC
+	aesenclast	XDATA1, [KEYS1 + 16*14]	; 14. ENC
+	aesenclast	XDATA2, [KEYS2 + 16*14]	; 14. ENC
+	aesenclast	XDATA3, [KEYS3 + 16*14]	; 14. ENC
+
+	MOVDQ		[OUT0], XDATA0		; write back ciphertext
+	MOVDQ		[OUT1], XDATA1		; write back ciphertext
+	MOVDQ		[OUT2], XDATA2		; write back ciphertext
+	MOVDQ		[OUT3], XDATA3		; write back ciphertext
+	;; IDX is 16 here: when LEN is exactly one block there is nothing left for main_loop
+	cmp		LEN, IDX
+	je		done
+
+main_loop:
+	pxor2		XDATA0, [IN0 + IDX]	; plaintext XOR previous ciphertext
+	pxor2		XDATA1, [IN1 + IDX]	; plaintext XOR previous ciphertext
+	pxor2		XDATA2, [IN2 + IDX]	; plaintext XOR previous ciphertext
+	pxor2		XDATA3, [IN3 + IDX]	; plaintext XOR previous ciphertext
+	;; XDATAn entered the loop holding lane n's previous ciphertext block, i.e. the CBC chaining value
+
+	pxor		XDATA0, [KEYS0 + 16*0] 	; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0] 	; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0] 	; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0] 	; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenc    	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenc    	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenc    	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenc    	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*11]	; 11. ENC
+	aesenc		XDATA1, [KEYS1 + 16*11]	; 11. ENC
+	aesenc		XDATA2, [KEYS2 + 16*11]	; 11. ENC
+	aesenc		XDATA3, [KEYS3 + 16*11]	; 11. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*12]	; 12. ENC
+	aesenc		XDATA1, [KEYS1 + 16*12]	; 12. ENC
+	aesenc		XDATA2, [KEYS2 + 16*12]	; 12. ENC
+	aesenc		XDATA3, [KEYS3 + 16*12]	; 12. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*13]	; 13. ENC
+	aesenc		XDATA1, [KEYS1 + 16*13]	; 13. ENC
+	aesenc		XDATA2, [KEYS2 + 16*13]	; 13. ENC
+	aesenc		XDATA3, [KEYS3 + 16*13]	; 13. ENC
+
+	aesenclast	XDATA0, [KEYS0 + 16*14]	; 14. ENC
+	aesenclast	XDATA1, [KEYS1 + 16*14]	; 14. ENC
+	aesenclast	XDATA2, [KEYS2 + 16*14]	; 14. ENC
+	aesenclast	XDATA3, [KEYS3 + 16*14]	; 14. ENC
+
+
+	MOVDQ		[OUT0 + IDX], XDATA0	; write back ciphertext
+	MOVDQ		[OUT1 + IDX], XDATA1	; write back ciphertext
+	MOVDQ		[OUT2 + IDX], XDATA2	; write back ciphertext
+	MOVDQ		[OUT3 + IDX], XDATA3	; write back ciphertext
+
+	;; the exit test below is an equality check, so LEN is expected to be a non-zero multiple of 16
+	add	IDX, 16
+	cmp	LEN, IDX
+	jne	main_loop
+
+done:
+	;; update IV with the last ciphertext block of each lane
+	movdqa	[ARG + _aesarg_IV + 16*0], XDATA0
+	movdqa	[ARG + _aesarg_IV + 16*1], XDATA1
+	movdqa	[ARG + _aesarg_IV + 16*2], XDATA2
+	movdqa	[ARG + _aesarg_IV + 16*3], XDATA3
+
+	;; update IN and OUT: advance every lane pointer by LEN and store it back into AES_ARGS
+	add	IN0, LEN
+	mov	[ARG + _aesarg_in + 8*0], IN0
+	add	IN1, LEN
+	mov	[ARG + _aesarg_in + 8*1], IN1
+	add	IN2, LEN
+	mov	[ARG + _aesarg_in + 8*2], IN2
+	add	IN3, LEN
+	mov	[ARG + _aesarg_in + 8*3], IN3
+
+	add	OUT0, LEN
+	mov	[ARG + _aesarg_out + 8*0], OUT0
+	add	OUT1, LEN
+	mov	[ARG + _aesarg_out + 8*1], OUT1
+	add	OUT2, LEN
+	mov	[ARG + _aesarg_out + 8*2], OUT2
+	add	OUT3, LEN
+	mov	[ARG + _aesarg_out + 8*3], OUT3
+
+	pop	rbp
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm
new file mode 100644
index 000000000..1ee400bb4
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm
@@ -0,0 +1,167 @@
+;;
+;; Copyright (c) 2017-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/clear_regs.asm"
+
+;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only.
+;;; It processes only one buffer at a time.
+;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI
+
+;; In System V AMD64 ABI
+;;	callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;;	callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers:		RAX RBX RCX RDX RBP RSI RDI R8  R9  R10 R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Windows clobbers:	RAX                             R9  R10 R11
+;; Windows preserves:	    RBX RCX RDX RBP RSI RDI R8              R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Linux clobbers:	RAX                             R9  R10
+;; Linux preserves:	    RBX RCX RDX RBP RSI RDI R8          R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0, xmm1
+;;
+
+%ifndef AES_CFB_128_ONE
+%define AES_CFB_128_ONE aes_cfb_128_one_sse	; default symbol name; define AES_CFB_128_ONE beforehand to override it
+%endif
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define arg3	rdx
+%define arg4	rcx
+%define arg5	r8
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define arg3	r8
+%define arg4	r9
+%define arg5	[rsp + 5*8]	; 5th argument is on the stack (past the return address and 32-byte shadow space); valid only while rsp is unchanged from entry
+%endif
+
+%define OUT	arg1	; destination buffer
+%define IN	arg2	; source buffer
+%define IV	arg3	; initialization vector pointer
+%define KEYS	arg4	; expanded key schedule pointer
+%ifdef LINUX
+%define LEN	arg5	; length already arrives in a register
+%else
+%define LEN2	arg5	; stack slot holding the length
+%define LEN	r11	; register copy of LEN2, loaded at function entry
+%endif
+
+%define TMP0	rax	; scratch GPR passed to simd_store_sse
+%define TMP1	r10	; scratch GPR passed to simd_store_sse
+
+%define XDATA	xmm0	; encrypted IV, then the output block
+%define XIN	xmm1	; input block loaded from IN
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys, uint64_t len)
+;; arg 1: OUT : addr to put clear/cipher text out
+;; arg 2: IN  : addr to take cipher/clear text from
+;; arg 3: IV  : initialization vector
+;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned)
+;; arg 5: LEN:  length of the text to encrypt/decrypt (valid range is 0 to 16)
+;;
+;; AES CFB128 one block encrypt/decrypt implementation.
+;; The function doesn't update IV. The result of operation can be found in OUT.
+;;
+;; It is primarily designed to process a partial block of
+;; DOCSIS 3.1 AES Packet PDU Encryption (I.10)
+;;
+;; It processes up to one block only (up to 16 bytes).
+;;
+;; It makes sure not to read more than LEN bytes from IN and
+;; not to store more than LEN bytes to OUT.
+
+MKGLOBAL(AES_CFB_128_ONE,function,)
+align 32
+AES_CFB_128_ONE:
+%ifndef LINUX
+	mov		LEN, LEN2		; bring the stack-passed length into a register before anything else
+%endif
+%ifdef SAFE_PARAM				; optional argument validation: return without touching memory on bad input
+        test            IV, IV			; NULL IV?
+        jz              exit_cfb
+
+        test            KEYS, KEYS		; NULL key schedule?
+        jz              exit_cfb
+
+        test            LEN, LEN		; LEN == 0: the IN/OUT NULL checks are skipped
+        jz              skip_in_out_check
+
+        test            OUT, OUT		; NULL output buffer?
+        jz              exit_cfb
+
+        test            IN, IN			; NULL input buffer?
+        jz              exit_cfb
+
+skip_in_out_check:
+%endif
+	;; load the (possibly partial) input block of LEN bytes into XIN
+	simd_load_sse_16 XIN, IN, LEN
+
+	;; encrypt the IV with the 10-round (128-bit key) schedule: round keys 0..10
+
+	movdqu		XDATA, [IV] 		; IV (or next to last block)
+	pxor		XDATA, [KEYS + 16*0]	; 0. ARK
+	aesenc		XDATA, [KEYS + 16*1]	; 1. ENC
+	aesenc		XDATA, [KEYS + 16*2]	; 2. ENC
+	aesenc		XDATA, [KEYS + 16*3]	; 3. ENC
+	aesenc		XDATA, [KEYS + 16*4]	; 4. ENC
+	aesenc		XDATA, [KEYS + 16*5]	; 5. ENC
+	aesenc		XDATA, [KEYS + 16*6]	; 6. ENC
+	aesenc		XDATA, [KEYS + 16*7]	; 7. ENC
+	aesenc		XDATA, [KEYS + 16*8]	; 8. ENC
+	aesenc		XDATA, [KEYS + 16*9]	; 9. ENC
+	aesenclast	XDATA, [KEYS + 16*10]	; 10. ENC
+
+	pxor		XDATA, XIN 		; plaintext/ciphertext XOR block cipher encryption
+	;; write the result to OUT, passing LEN as the byte count; TMP0/TMP1 are scratch for the store macro
+	simd_store_sse	OUT, XDATA, LEN, TMP0, TMP1
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifdef SAFE_DATA
+        ;; XDATA and XIN are the only scratch SIMD registers used
+        clear_xmms_sse  XDATA, XIN
+        clear_scratch_gps_asm
+%endif
+exit_cfb:
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm
new file mode 100644
index 000000000..c4b767932
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm
@@ -0,0 +1,654 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES ECB encrypt/decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_ecb_x_y_sse(void    *in,
+;                      UINT128  keys[],
+;                      void    *out,
+;                      UINT64   len_bytes);
+;
+; x = direction (enc/dec)
+; y = key size (128/192/256)
+; arg 1: IN:   pointer to input (plain text for enc, cipher text for dec)
+; arg 2: KEYS: pointer to keys
+; arg 3: OUT:  pointer to output (cipher text for enc, plain text for dec)
+; arg 4: LEN:  length in bytes (multiple of 16)
+;
+
+%include "include/os.asm"
+
+%ifndef AES_ECB_ENC_128
+%define AES_ECB_ENC_128 aes_ecb_enc_128_sse	; default symbol names; all six are skipped when AES_ECB_ENC_128 is already defined
+%define AES_ECB_ENC_192 aes_ecb_enc_192_sse
+%define AES_ECB_ENC_256 aes_ecb_enc_256_sse
+%define AES_ECB_DEC_128 aes_ecb_dec_128_sse
+%define AES_ECB_DEC_192 aes_ecb_dec_192_sse
+%define AES_ECB_DEC_256 aes_ecb_dec_256_sse
+%endif
+
+%ifdef LINUX
+%define IN		rdi	; arg 1 (System V AMD64 argument order)
+%define KEYS		rsi	; arg 2
+%define OUT		rdx	; arg 3
+%define LEN		rcx	; arg 4
+%else
+%define IN		rcx	; arg 1 (Microsoft x64 argument order)
+%define KEYS		rdx	; arg 2
+%define OUT		r8	; arg 3
+%define LEN		r9	; arg 4
+%endif
+
+%define IDX		rax	; byte offset into IN/OUT
+%define TMP		IDX	; TMP shares rax with IDX
+%define XDATA0		xmm0	; XDATA0-3: up to four blocks in flight
+%define XDATA1		xmm1
+%define XDATA2		xmm2
+%define XDATA3		xmm3
+%define XKEY0		xmm4	; XKEY0/2/4/6: round keys 0, 2, 4, 6 - loaded once, reused by the main loop
+%define XKEY2		xmm5
+%define XKEY4		xmm6
+%define XKEY6		xmm7
+%define XKEY10		xmm8	; round key 10, used by the 1- and 2-block paths only
+%define XKEY_A		xmm14	; XKEY_A/XKEY_B: temporaries reloaded with the remaining round keys
+%define XKEY_B		xmm15
+
+section .text
+
+%macro AES_ECB 2
+%define %%NROUNDS %1 ; [in] Number of AES rounds, numerical value
+%define %%DIR     %2 ; [in] Direction (encrypt/decrypt)
+
+%ifidn %%DIR, ENC
+%define AES      aesenc		; instruction used for every middle round below
+%define AES_LAST aesenclast	; instruction used for the final round
+%else ; DIR = DEC
+%define AES      aesdec		; decrypt: the "n. ENC" comments below then denote aesdec rounds
+%define AES_LAST aesdeclast
+%endif
+	mov	TMP, LEN		; TMP = LEN mod 64 picks the first batch: 1, 2 or 3 blocks, or 4 when the remainder is 0
+	and	TMP, 3*16
+	jz	%%initial_4		; remainder 0
+	cmp	TMP, 2*16
+	jb	%%initial_1		; remainder 16
+	ja	%%initial_3		; remainder 48
+	; remainder 32 falls through. NOTE(review): LEN == 0 also has remainder 0 and is not rejected here - callers presumably never pass it; confirm
+%%initial_2:
+	; load plain/cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	AES	XDATA0, [KEYS + 1*16]	; 1. ENC
+	AES	XDATA1, [KEYS + 1*16]
+
+	mov	IDX, 2*16		; IDX = bytes handled by this first batch (IDX is the same register as TMP)
+
+	AES	XDATA0, XKEY2		; 2. ENC
+	AES	XDATA1, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	AES	XDATA0, [KEYS + 3*16]	; 3. ENC
+	AES	XDATA1, [KEYS + 3*16]
+
+	AES	XDATA0, XKEY4		; 4. ENC
+	AES	XDATA1, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	AES	XDATA0, [KEYS + 5*16]	; 5. ENC
+	AES	XDATA1, [KEYS + 5*16]
+
+	AES	XDATA0, XKEY6		; 6. ENC
+	AES	XDATA1, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	AES	XDATA0, [KEYS + 7*16]	; 7. ENC
+	AES	XDATA1, [KEYS + 7*16]
+
+	AES	XDATA0, XKEY_B		; 8. ENC
+	AES	XDATA1, XKEY_B
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	AES	XDATA0, [KEYS + 9*16]	; 9. ENC
+	AES	XDATA1, [KEYS + 9*16]
+
+%if %%NROUNDS >= 12
+	AES	XDATA0, XKEY10		; 10. ENC
+	AES	XDATA1, XKEY10
+
+	AES	XDATA0, [KEYS + 11*16]	; 11. ENC
+	AES	XDATA1, [KEYS + 11*16]
+%endif
+
+%if %%NROUNDS == 14
+	AES	XDATA0, [KEYS + 12*16]	; 12. ENC
+	AES	XDATA1, [KEYS + 12*16]
+
+	AES	XDATA0, [KEYS + 13*16]	; 13. ENC
+	AES	XDATA1, [KEYS + 13*16]
+%endif
+
+%if %%NROUNDS == 10
+	AES_LAST	XDATA0, XKEY10	; 10. ENC
+	AES_LAST	XDATA1, XKEY10
+%elif %%NROUNDS == 12
+	AES_LAST	XDATA0, [KEYS + 12*16]	; 12. ENC
+	AES_LAST	XDATA1, [KEYS + 12*16]
+%else
+	AES_LAST	XDATA0, [KEYS + 14*16]	; 14. ENC
+	AES_LAST	XDATA1, [KEYS + 14*16]
+%endif
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+
+	cmp	LEN, 2*16		; nothing beyond the first batch?
+	je	%%done
+	jmp	%%main_loop		; otherwise continue 4 blocks at a time
+
+
+	align 16
+%%initial_1:
+	; load plain/cipher text
+	movdqu	XDATA0, [IN + 0*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	AES	XDATA0, [KEYS + 1*16]	; 1. ENC
+
+	mov	IDX, 1*16		; one block handled by this first batch
+
+	AES	XDATA0, XKEY2		; 2. ENC
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	AES	XDATA0, [KEYS + 3*16]	; 3. ENC
+
+	AES	XDATA0, XKEY4		; 4. ENC
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	AES	XDATA0, [KEYS + 5*16]	; 5. ENC
+
+	AES	XDATA0, XKEY6		; 6. ENC
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	AES	XDATA0, [KEYS + 7*16]	; 7. ENC
+
+	AES	XDATA0, XKEY_B		; 8. ENC
+
+	movdqa	XKEY10, [KEYS + 10*16]
+
+	AES	XDATA0, [KEYS + 9*16]	; 9. ENC
+
+%if %%NROUNDS >= 12
+	AES	XDATA0, XKEY10		; 10. ENC
+
+	AES	XDATA0, [KEYS + 11*16]	; 11. ENC
+%endif
+
+%if %%NROUNDS == 14
+	AES	XDATA0, [KEYS + 12*16]	; 12. ENC
+
+	AES	XDATA0, [KEYS + 13*16]	; 13. ENC
+%endif
+
+%if %%NROUNDS == 10
+
+	AES_LAST	XDATA0, XKEY10	        ; 10. ENC
+%elif %%NROUNDS == 12
+	AES_LAST	XDATA0, [KEYS + 12*16]	; 12. ENC
+%else
+	AES_LAST	XDATA0, [KEYS + 14*16]	; 14. ENC
+%endif
+
+	movdqu	[OUT + 0*16], XDATA0
+
+	cmp	LEN, 1*16		; nothing beyond the first batch?
+	je	%%done
+	jmp	%%main_loop
+
+
+%%initial_3:
+	; load plain/cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	AES	XDATA0, XKEY_A		; 1. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+	mov	IDX, 3*16		; three blocks handled by this first batch
+
+	AES	XDATA0, XKEY2		; 2. ENC
+	AES	XDATA1, XKEY2
+	AES	XDATA2, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	AES	XDATA0, XKEY_A		; 3. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+
+	AES	XDATA0, XKEY4		; 4. ENC
+	AES	XDATA1, XKEY4
+	AES	XDATA2, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	AES	XDATA0, XKEY_A		; 5. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	AES	XDATA0, XKEY6		; 6. ENC
+	AES	XDATA1, XKEY6
+	AES	XDATA2, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	AES	XDATA0, XKEY_A		; 7. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	AES	XDATA0, XKEY_B		; 8. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 10*16]
+
+	AES	XDATA0, XKEY_A		; 9. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+
+%if %%NROUNDS >= 12
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	AES	XDATA0, XKEY_B		; 10. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 12*16]
+
+	AES	XDATA0, XKEY_A		; 11. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+
+%endif
+
+%if %%NROUNDS == 14
+	movdqa	XKEY_A, [KEYS + 13*16]
+
+	AES	XDATA0, XKEY_B		; 12. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 14*16]
+
+	AES	XDATA0, XKEY_A		; 13. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+%endif
+
+	AES_LAST	XDATA0, XKEY_B	; 10/12/14. ENC (depending on key size)
+	AES_LAST	XDATA1, XKEY_B
+	AES_LAST	XDATA2, XKEY_B
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+
+	cmp	LEN, 3*16		; nothing beyond the first batch?
+	je	%%done
+	jmp	%%main_loop
+
+
+	align 16
+%%initial_4:
+	; load plain/cipher text
+	movdqu	XDATA0, [IN + 0*16]
+	movdqu	XDATA1, [IN + 1*16]
+	movdqu	XDATA2, [IN + 2*16]
+	movdqu	XDATA3, [IN + 3*16]
+
+	movdqa	XKEY0, [KEYS + 0*16]
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	movdqa	XKEY2, [KEYS + 2*16]
+
+	AES	XDATA0, XKEY_A		; 1. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+
+	mov	IDX, 4*16		; four blocks handled by this first batch
+
+	AES	XDATA0, XKEY2		; 2. ENC
+	AES	XDATA1, XKEY2
+	AES	XDATA2, XKEY2
+	AES	XDATA3, XKEY2
+
+	movdqa	XKEY4, [KEYS + 4*16]
+
+	AES	XDATA0, XKEY_A		; 3. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+
+	AES	XDATA0, XKEY4		; 4. ENC
+	AES	XDATA1, XKEY4
+	AES	XDATA2, XKEY4
+	AES	XDATA3, XKEY4
+
+	movdqa	XKEY6, [KEYS + 6*16]
+
+	AES	XDATA0, XKEY_A		; 5. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	AES	XDATA0, XKEY6		; 6. ENC
+	AES	XDATA1, XKEY6
+	AES	XDATA2, XKEY6
+	AES	XDATA3, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	AES	XDATA0, XKEY_A		; 7. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	AES	XDATA0, XKEY_B		; 8. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+	AES	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 10*16]
+
+	AES	XDATA0, XKEY_A		; 9. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	AES	XDATA0, XKEY_B	; 10. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+	AES	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 12*16]
+
+	AES	XDATA0, XKEY_A		; 11. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+	movdqa	XKEY_A, [KEYS + 13*16]
+
+	AES	XDATA0, XKEY_B		; 12. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+	AES	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 14*16]
+
+	AES	XDATA0, XKEY_A		; 13. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+%endif
+
+	AES_LAST	XDATA0, XKEY_B	; 10/12/14. ENC (depending on key size)
+	AES_LAST	XDATA1, XKEY_B
+	AES_LAST	XDATA2, XKEY_B
+	AES_LAST	XDATA3, XKEY_B
+
+	movdqu	[OUT + 0*16], XDATA0
+	movdqu	[OUT + 1*16], XDATA1
+	movdqu	[OUT + 2*16], XDATA2
+	movdqu	[OUT + 3*16], XDATA3
+
+	cmp	LEN, 4*16		; nothing beyond the first batch?
+	jz	%%done
+	jmp	%%main_loop
+	; Steady state: 4 blocks per iteration. IDX = byte offset of the next block; XKEY0/2/4/6 still hold round keys 0/2/4/6 from the initial_* code.
+	align 16
+%%main_loop:
+	; load plain/cipher text
+	movdqu	XDATA0, [IN + IDX + 0*16]
+	movdqu	XDATA1, [IN + IDX + 1*16]
+	movdqu	XDATA2, [IN + IDX + 2*16]
+	movdqu	XDATA3, [IN + IDX + 3*16]
+
+	movdqa	XKEY_A, [KEYS + 1*16]
+
+	pxor	XDATA0, XKEY0		; 0. ARK
+	pxor	XDATA1, XKEY0
+	pxor	XDATA2, XKEY0
+	pxor	XDATA3, XKEY0
+
+	add	IDX, 4*16		; advanced early; the stores below subtract 4*16 again
+
+	AES	XDATA0, XKEY_A		; 1. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 3*16]
+
+	AES	XDATA0, XKEY2		; 2. ENC
+	AES	XDATA1, XKEY2
+	AES	XDATA2, XKEY2
+	AES	XDATA3, XKEY2
+
+	AES	XDATA0, XKEY_A		; 3. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 5*16]
+
+	AES	XDATA0, XKEY4		; 4. ENC
+	AES	XDATA1, XKEY4
+	AES	XDATA2, XKEY4
+	AES	XDATA3, XKEY4
+
+	AES	XDATA0, XKEY_A		; 5. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 7*16]
+
+	AES	XDATA0, XKEY6		; 6. ENC
+	AES	XDATA1, XKEY6
+	AES	XDATA2, XKEY6
+	AES	XDATA3, XKEY6
+
+	movdqa	XKEY_B, [KEYS + 8*16]
+
+	AES	XDATA0, XKEY_A		; 7. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+	movdqa	XKEY_A, [KEYS + 9*16]
+
+	AES	XDATA0, XKEY_B		; 8. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+	AES	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 10*16]
+
+        AES	XDATA0, XKEY_A		; 9. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+	movdqa	XKEY_A, [KEYS + 11*16]
+
+	AES	XDATA0, XKEY_B		; 10. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+	AES	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 12*16]
+
+	AES	XDATA0, XKEY_A		; 11. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+	movdqa	XKEY_A, [KEYS + 13*16]
+
+	AES	XDATA0, XKEY_B		; 12. ENC
+	AES	XDATA1, XKEY_B
+	AES	XDATA2, XKEY_B
+	AES	XDATA3, XKEY_B
+
+	movdqa	XKEY_B, [KEYS + 14*16]
+
+	AES	XDATA0, XKEY_A		; 13. ENC
+	AES	XDATA1, XKEY_A
+	AES	XDATA2, XKEY_A
+	AES	XDATA3, XKEY_A
+%endif
+
+	AES_LAST	XDATA0, XKEY_B	; 10/12/14. ENC (depending on key size)
+	AES_LAST	XDATA1, XKEY_B
+	AES_LAST	XDATA2, XKEY_B
+	AES_LAST	XDATA3, XKEY_B
+
+	movdqu	[OUT + IDX + 0*16 - 4*16], XDATA0
+	movdqu	[OUT + IDX + 1*16 - 4*16], XDATA1
+	movdqu	[OUT + IDX + 2*16 - 4*16], XDATA2
+	movdqu	[OUT + IDX + 3*16 - 4*16], XDATA3
+
+	cmp     IDX, LEN		; exit only on exact equality, so LEN must be a multiple of 16
+	jne	%%main_loop
+
+%%done:
+
+	ret				; the macro ends with ret, so each expansion is a complete routine
+
+%endmacro
+
+align 16
+MKGLOBAL(AES_ECB_ENC_128,function,internal)
+AES_ECB_ENC_128:
+        ; AES-128 ECB encrypt: 10 rounds; the macro body supplies the ret
+        AES_ECB 10, ENC
+
+align 16
+MKGLOBAL(AES_ECB_ENC_192,function,internal)
+AES_ECB_ENC_192:
+        ; AES-192 ECB encrypt: 12 rounds
+        AES_ECB 12, ENC
+
+align 16
+MKGLOBAL(AES_ECB_ENC_256,function,internal)
+AES_ECB_ENC_256:
+        ; AES-256 ECB encrypt: 14 rounds
+        AES_ECB 14, ENC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_128,function,internal)
+AES_ECB_DEC_128:
+        ; AES-128 ECB decrypt: 10 rounds using aesdec/aesdeclast
+        AES_ECB 10, DEC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_192,function,internal)
+AES_ECB_DEC_192:
+        ; AES-192 ECB decrypt: 12 rounds using aesdec/aesdeclast
+        AES_ECB 12, DEC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_256,function,internal)
+AES_ECB_DEC_256:
+        ; AES-256 ECB decrypt: 14 rounds using aesdec/aesdeclast
+        AES_ECB 14, DEC
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm
new file mode 100644
index 000000000..afbb38512
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm
@@ -0,0 +1,303 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do 128 bit AES XCBC
+;;; process 4 buffers at a time, single data structure as input
+;;; Updates In pointer at end
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+
+%ifndef AES_XCBC_X4
+%define AES_XCBC_X4 aes_xcbc_mac_128_x4	; default symbol name, used unless AES_XCBC_X4 is already defined
+%endif
+;; pxor2 dst, src: dst ^= src, loading src through XTMP with MOVDQ so src need not be aligned. Clobbers XTMP.
+%define	MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+	MOVDQ	XTMP, %2	; XTMP = source operand (unaligned load)
+	pxor	%1, XTMP	; %1 ^= XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_XCBC_ARGS_x8 {
+;;     void*    in[8];
+;;     UINT128* keys[8];
+;;     UINT128  ICV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_xcbc_mac_128_x4(AES_XCBC_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
+%ifdef LINUX
+%define ARG	rdi	; arg 1 (System V AMD64)
+%define LEN	rsi	; arg 2
+%define REG3	rcx	; REG3/REG4: free registers that become KEYS1/KEYS2
+%define REG4	rdx
+%else
+%define ARG	rcx	; arg 1 (Microsoft x64)
+%define LEN	rdx	; arg 2
+%define REG3	rsi	; REG3/REG4: become KEYS1/KEYS2; not saved by AES_XCBC_X4
+%define REG4	rdi
+%endif
+
+%define IDX	rax	; byte offset into each lane's input
+
+%define IN0	r8	; INn = input pointer of lane n
+%define KEYS0	rbx	; KEYSn = key schedule pointer of lane n (rbx is not saved by AES_XCBC_X4)
+%define OUT0	r9	; OUT0-OUT3 are defined but not referenced by AES_XCBC_X4
+
+%define IN1	r10
+%define KEYS1	REG3
+%define OUT1	r11
+
+%define IN2	r12
+%define KEYS2	REG4
+%define OUT2	r13
+
+%define IN3	r14
+%define KEYS3	rbp	; the only register AES_XCBC_X4 pushes/pops
+%define OUT3	r15
+
+
+%define XDATA0		xmm0	; XDATAn = running MAC state of lane n
+%define XDATA1		xmm1
+%define XDATA2		xmm2
+%define XDATA3		xmm3
+
+%define XKEY0_3		xmm4	; XKEYn_r = cached round key r of lane n
+%define XKEY0_6		[KEYS0 + 16*6]	; lane 0 round 6 key has no register: it stays a memory operand
+%define XTMP		xmm5	; scratch used by pxor2
+%define XKEY0_9		xmm6
+
+%define XKEY1_3		xmm7
+%define XKEY1_6		xmm8
+%define XKEY1_9		xmm9
+
+%define XKEY2_3		xmm10
+%define XKEY2_6		xmm11
+%define XKEY2_9		xmm12
+
+%define XKEY3_3		xmm13
+%define XKEY3_6		xmm14
+%define XKEY3_9		xmm15
+
+section .text
+
+MKGLOBAL(AES_XCBC_X4,function,internal)
+AES_XCBC_X4:
+	;; In: ARG = args structure, LEN = bytes per lane. One block per lane is always processed; the loop exits when IDX == LEN (IDX steps by 16).
+	push	rbp			; rbp is reused as KEYS3
+
+	mov	IDX, 16			; the first block is handled before the loop, so IDX starts at 16
+
+	mov	IN0,	[ARG + _aesxcbcarg_in + 8*0]
+	mov	IN1,	[ARG + _aesxcbcarg_in + 8*1]
+	mov	IN2,	[ARG + _aesxcbcarg_in + 8*2]
+	mov	IN3,	[ARG + _aesxcbcarg_in + 8*3]
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+	MOVDQ		XDATA0, [IN0]		; load first block of plain text
+	MOVDQ		XDATA1, [IN1]		; load first block of plain text
+	MOVDQ		XDATA2, [IN2]		; load first block of plain text
+	MOVDQ		XDATA3, [IN3]		; load first block of plain text
+
+	mov		KEYS0,	[ARG + _aesxcbcarg_keys + 8*0]
+	mov		KEYS1,	[ARG + _aesxcbcarg_keys + 8*1]
+	mov		KEYS2,	[ARG + _aesxcbcarg_keys + 8*2]
+	mov		KEYS3,	[ARG + _aesxcbcarg_keys + 8*3]
+	;; the non-VEX memory operands below require the ICV slots and key schedules to be 16-byte aligned
+	pxor		XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV
+	pxor		XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV
+	pxor		XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV
+	pxor		XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV
+
+	pxor		XDATA0, [KEYS0 + 16*0]		; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0]		; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0]		; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0]		; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+
+	movdqa		XKEY0_3, [KEYS0 + 16*3]	; load round 3 key
+	movdqa		XKEY1_3, [KEYS1 + 16*3]	; load round 3 key
+	movdqa		XKEY2_3, [KEYS2 + 16*3]	; load round 3 key
+	movdqa		XKEY3_3, [KEYS3 + 16*3]	; load round 3 key
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+	;; lane 0 has no register for its round 6 key: XKEY0_6 is the memory operand [KEYS0 + 16*6]
+	movdqa		XKEY1_6, [KEYS1 + 16*6]	; load round 6 key
+	movdqa		XKEY2_6, [KEYS2 + 16*6]	; load round 6 key
+	movdqa		XKEY3_6, [KEYS3 + 16*6]	; load round 6 key
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	movdqa		XKEY0_9, [KEYS0 + 16*9]	; load round 9 key
+	movdqa		XKEY1_9, [KEYS1 + 16*9]	; load round 9 key
+	movdqa		XKEY2_9, [KEYS2 + 16*9]	; load round 9 key
+	movdqa		XKEY3_9, [KEYS3 + 16*9]	; load round 9 key
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenclast	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenclast	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenclast	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenclast	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+	cmp		LEN, IDX		; only one block per lane (LEN == 16)?
+	je		done
+
+main_loop:
+	pxor2		XDATA0, [IN0 + IDX]	; next plaintext block XOR running state (previous cipher block)
+	pxor2		XDATA1, [IN1 + IDX]	; next plaintext block XOR running state (previous cipher block)
+	pxor2		XDATA2, [IN2 + IDX]	; next plaintext block XOR running state (previous cipher block)
+	pxor2		XDATA3, [IN3 + IDX]	; next plaintext block XOR running state (previous cipher block)
+
+	pxor		XDATA0, [KEYS0 + 16*0] 	; 0. ARK
+	pxor		XDATA1, [KEYS1 + 16*0] 	; 0. ARK
+	pxor		XDATA2, [KEYS2 + 16*0] 	; 0. ARK
+	pxor		XDATA3, [KEYS3 + 16*0] 	; 0. ARK
+
+	aesenc		XDATA0, [KEYS0 + 16*1]	; 1. ENC
+	aesenc		XDATA1, [KEYS1 + 16*1]	; 1. ENC
+	aesenc		XDATA2, [KEYS2 + 16*1]	; 1. ENC
+	aesenc		XDATA3, [KEYS3 + 16*1]	; 1. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*2]	; 2. ENC
+	aesenc		XDATA1, [KEYS1 + 16*2]	; 2. ENC
+	aesenc		XDATA2, [KEYS2 + 16*2]	; 2. ENC
+	aesenc		XDATA3, [KEYS3 + 16*2]	; 2. ENC
+
+	aesenc		XDATA0, XKEY0_3		; 3. ENC
+	aesenc		XDATA1, XKEY1_3		; 3. ENC
+	aesenc		XDATA2, XKEY2_3		; 3. ENC
+	aesenc		XDATA3, XKEY3_3		; 3. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*4]	; 4. ENC
+	aesenc		XDATA1, [KEYS1 + 16*4]	; 4. ENC
+	aesenc		XDATA2, [KEYS2 + 16*4]	; 4. ENC
+	aesenc		XDATA3, [KEYS3 + 16*4]	; 4. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*5]	; 5. ENC
+	aesenc		XDATA1, [KEYS1 + 16*5]	; 5. ENC
+	aesenc		XDATA2, [KEYS2 + 16*5]	; 5. ENC
+	aesenc		XDATA3, [KEYS3 + 16*5]	; 5. ENC
+
+	aesenc		XDATA0, XKEY0_6		; 6. ENC
+	aesenc		XDATA1, XKEY1_6		; 6. ENC
+	aesenc		XDATA2, XKEY2_6		; 6. ENC
+	aesenc		XDATA3, XKEY3_6		; 6. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*7]	; 7. ENC
+	aesenc		XDATA1, [KEYS1 + 16*7]	; 7. ENC
+	aesenc		XDATA2, [KEYS2 + 16*7]	; 7. ENC
+	aesenc		XDATA3, [KEYS3 + 16*7]	; 7. ENC
+
+	aesenc		XDATA0, [KEYS0 + 16*8]	; 8. ENC
+	aesenc		XDATA1, [KEYS1 + 16*8]	; 8. ENC
+	aesenc		XDATA2, [KEYS2 + 16*8]	; 8. ENC
+	aesenc		XDATA3, [KEYS3 + 16*8]	; 8. ENC
+
+	aesenc		XDATA0, XKEY0_9		; 9. ENC
+	aesenc		XDATA1, XKEY1_9		; 9. ENC
+	aesenc		XDATA2, XKEY2_9		; 9. ENC
+	aesenc		XDATA3, XKEY3_9		; 9. ENC
+
+	aesenclast	XDATA0, [KEYS0 + 16*10]	; 10. ENC
+	aesenclast	XDATA1, [KEYS1 + 16*10]	; 10. ENC
+	aesenclast	XDATA2, [KEYS2 + 16*10]	; 10. ENC
+	aesenclast	XDATA3, [KEYS3 + 16*10]	; 10. ENC
+
+	add	IDX, 16
+	cmp	LEN, IDX		; equality test: terminates only when LEN is a multiple of 16 that is >= 16
+	jne	main_loop
+
+done:
+	;; update ICV
+	movdqa	[ARG + _aesxcbcarg_ICV + 16*0], XDATA0
+	movdqa	[ARG + _aesxcbcarg_ICV + 16*1], XDATA1
+	movdqa	[ARG + _aesxcbcarg_ICV + 16*2], XDATA2
+	movdqa	[ARG + _aesxcbcarg_ICV + 16*3], XDATA3
+
+	;; update IN
+	add	IN0, LEN		; in[n] += LEN for each of the four lanes
+	mov	[ARG + _aesxcbcarg_in + 8*0], IN0
+	add	IN1, LEN
+	mov	[ARG + _aesxcbcarg_in + 8*1], IN1
+	add	IN2, LEN
+	mov	[ARG + _aesxcbcarg_in + 8*2], IN2
+	add	IN3, LEN
+	mov	[ARG + _aesxcbcarg_in + 8*3], IN3
+
+	pop	rbp			; restore the caller's rbp (used above as KEYS3)
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm
new file mode 100644
index 000000000..b8d3ea963
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm
@@ -0,0 +1,30 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define GCM128_MODE 1	; mode selector set before including the shared GCM source (presumably selects the 128-bit key build - confirm in gcm_sse.asm)
+%include "sse/gcm_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm
new file mode 100644
index 000000000..68e995a06
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2017-2018, Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1	; mode selector set before including the shared GCM source (presumably selects the 192-bit key build - confirm in gcm_sse.asm)
+%include "sse/gcm_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm
new file mode 100644
index 000000000..3898411a1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1	; mode selector set before including the shared GCM source (presumably selects the 256-bit key build - confirm in gcm_sse.asm)
+%include "sse/gcm_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm
new file mode 100644
index 000000000..d053da51f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm
@@ -0,0 +1,2586 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+;       Erdinc Ozturk
+;       Vinodh Gopal
+;       James Guilford
+;
+;
+; References:
+;       This code was derived and highly optimized from the code described in paper:
+;               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+;       For the shift-based reductions used in this code, we used the method described in paper:
+;               Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+;       0                   1                   2                   3
+;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                             Salt  (From the SA)               |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                     Initialization Vector                     |
+;       |         (This is the sequence number from IPSec header)       |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                              0x1                              |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+;       AAD will be padded with 0 to the next 16byte multiple
+;       for example, assume AAD is a u32 vector
+;
+;       if AAD is 8 bytes:
+;       AAD[3] = {A0, A1};
+;       padded AAD in xmm register = {A1 A0 0 0}
+;
+;       0                   1                   2                   3
+;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                               SPI (A1)                        |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                     32-bit Sequence Number (A0)               |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                              0x0                              |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;                                       AAD Format with 32-bit Sequence Number
+;
+;       if AAD is 12 bytes:
+;       AAD[3] = {A0, A1, A2};
+;       padded AAD in xmm register = {A2 A1 A0 0}
+;
+;       0                   1                   2                   3
+;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                               SPI (A2)                        |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                 64-bit Extended Sequence Number {A1,A0}       |
+;       |                                                               |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;       |                              0x0                              |
+;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;        AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       By the definition of the spec, aadLen must be a multiple of 4 bytes.
+;       The code additionally supports any aadLen value.
+;
+; TLen:
+;       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_sse_avx.asm"
+%include "include/memcpy.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_sse.asm!"
+%endif
+%endif
+%endif
+; SSE is the trailing part of every generated symbol name: "sse", or "sse_no_aesni" when NO_AESNI is defined.
+%ifdef NO_AESNI
+%define SSE sse_no_aesni
+%else
+%define SSE sse
+%endif
+; Per key size: FN_NAME(x,y) pastes aes_gcm_<x>_<bits><y><SSE>; NROUNDS is the number of aesenc rounds issued before the final aesenclast.
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ SSE
+%define NROUNDS 9
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ SSE
+%define NROUNDS 11
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ SSE
+%define NROUNDS 13
+%endif
+
+default rel
+; 4 registers (8 bytes each) need to be pushed on the stack; STACK_OFFSET is their total size in bytes
+%define STACK_OFFSET 8*4
+; Byte offsets of the 16-byte scratch slots used to spill AES states 2..8
+%define	TMP2	16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define	TMP3	16*1    ; Temporary storage for AES State 3
+%define	TMP4	16*2    ; Temporary storage for AES State 4
+%define	TMP5	16*3    ; Temporary storage for AES State 5
+%define	TMP6	16*4    ; Temporary storage for AES State 6
+%define	TMP7	16*5    ; Temporary storage for AES State 7
+%define	TMP8	16*6    ; Temporary storage for AES State 8
+; LOCAL_STORAGE covers the seven 16-byte slots TMP2..TMP8 defined above
+%define	LOCAL_STORAGE	16*7
+; win64 builds reserve ten more 16-byte slots (presumably to save xmm6-xmm15, callee-saved in that ABI -- confirm at the save/restore code)
+%ifidn __OUTPUT_FORMAT__, win64
+	%define	XMM_STORAGE	16*10
+%else
+	%define	XMM_STORAGE	0
+%endif
+; NOTE(review): VARIABLE_OFFSET expands to an unparenthesised sum, so it is only safe as a plain add/sub operand
+%define	VARIABLE_OFFSET	LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro  GHASH_MUL  7
+%define %%GH %1         ; 16 Bytes: first operand on entry, reduced product on exit
+%define %%HK %2         ; 16 Bytes: second operand, only read by this macro
+%define %%T1 %3         ; %%T1-%%T5: xmm temporaries, all overwritten
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+        ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ; Karatsuba Method: three 64x64 carry-less multiplies instead of four
+        movdqa  %%T1, %%GH
+        pshufd  %%T2, %%GH, 01001110b                   ; swap the 64-bit halves of %%GH
+        pshufd  %%T3, %%HK, 01001110b                   ; swap the 64-bit halves of %%HK
+        pxor    %%T2, %%GH                              ; %%T2 = (a1+a0)
+        pxor    %%T3, %%HK                              ; %%T3 = (b1+b0)
+
+        pclmulqdq       %%T1, %%HK, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%GH, %%HK, 0x00                ; %%GH = a0*b0
+        pclmulqdq       %%T2, %%T3, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T2, %%GH
+        pxor    %%T2, %%T1                              ; %%T2 = a0*b1+a1*b0
+
+        movdqa  %%T3, %%T2
+        pslldq  %%T3, 8                                 ; shift-L %%T3 2 DWs
+        psrldq  %%T2, 8                                 ; shift-R %%T2 2 DWs
+        pxor    %%GH, %%T3
+        pxor    %%T1, %%T2                              ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+        ; From here on the 256-bit product <%%T1:%%GH> is reduced to 128 bits using shifts and XORs only
+        ;first phase of the reduction
+        movdqa  %%T2, %%GH
+        movdqa  %%T3, %%GH
+        movdqa  %%T4, %%GH                              ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift of each dword: << 31
+        pslld   %%T3, 30                                ; packed left shift of each dword: << 30
+        pslld   %%T4, 25                                ; packed left shift of each dword: << 25
+        pxor    %%T2, %%T3                              ; xor the shifted versions
+        pxor    %%T2, %%T4
+
+        movdqa  %%T5, %%T2                              ; keep a copy: it is folded in during the second phase
+        psrldq  %%T5, 4                                 ; shift-R %%T5 1 DW
+
+        pslldq  %%T2, 12                                ; shift-L %%T2 3 DWs
+        pxor    %%GH, %%T2                              ; first phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        ;second phase of the reduction
+        movdqa  %%T2,%%GH                               ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
+        movdqa  %%T3,%%GH
+        movdqa  %%T4,%%GH
+
+        psrld   %%T2,1                                  ; packed right shift of each dword: >> 1
+        psrld   %%T3,2                                  ; packed right shift of each dword: >> 2
+        psrld   %%T4,7                                  ; packed right shift of each dword: >> 7
+        pxor    %%T2,%%T3                               ; xor the shifted versions
+        pxor    %%T2,%%T4
+
+        pxor    %%T2, %%T5
+        pxor    %%GH, %%T2
+        pxor    %%GH, %%T1                              ; fold in the high half: the result is in %%GH
+        ; On exit: %%GH = reduced 128-bit result, %%T1-%%T5 are clobbered, %%HK is unchanged
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define	%%GDATA	%1	; base address of the key table that receives the stores below
+%define	%%HK	%2	; xmm with the hash key in the form GHASH_MUL expects (HashKey<<1 mod poly per its header); only read
+%define	%%T1	%3	; %%T1-%%T6: xmm temporaries, all overwritten
+%define	%%T2	%4
+%define	%%T3	%5
+%define	%%T4	%6
+%define	%%T5	%7
+%define	%%T6	%8
+; PRECOMPUTE: starting from %%HK, multiplies by %%HK seven times with GHASH_MUL and stores HashKey_2..HashKey_8,
+; plus the HashKey_k..HashKey_8_k helper values, at [%%GDATA + ...]. [%%GDATA + HashKey] itself is not written here.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
+        movdqa  %%T4, %%HK
+        pshufd  %%T1, %%HK, 01001110b                           ; swap the 64-bit halves
+        pxor    %%T1, %%HK
+        movdqu  [%%GDATA + HashKey_k], %%T1
+
+
+        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6      ;  %%T4 = HashKey^2<<1 mod poly
+        movdqu  [%%GDATA + HashKey_2], %%T4                         ;  [HashKey_2] = HashKey^2<<1 mod poly
+        pshufd  %%T1, %%T4, 01001110b
+        pxor    %%T1, %%T4
+        movdqu  [%%GDATA + HashKey_2_k], %%T1
+
+        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6              ;  %%T4 = HashKey^3<<1 mod poly
+        movdqu  [%%GDATA + HashKey_3], %%T4
+        pshufd  %%T1, %%T4, 01001110b
+        pxor    %%T1, %%T4
+        movdqu  [%%GDATA + HashKey_3_k], %%T1
+
+
+        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6              ;  %%T4 = HashKey^4<<1 mod poly
+        movdqu  [%%GDATA + HashKey_4], %%T4
+        pshufd  %%T1, %%T4, 01001110b
+        pxor    %%T1, %%T4
+        movdqu  [%%GDATA + HashKey_4_k], %%T1
+
+        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6              ;  %%T4 = HashKey^5<<1 mod poly
+        movdqu  [%%GDATA + HashKey_5], %%T4
+        pshufd  %%T1, %%T4, 01001110b
+        pxor    %%T1, %%T4
+        movdqu  [%%GDATA + HashKey_5_k], %%T1
+
+
+        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6              ;  %%T4 = HashKey^6<<1 mod poly
+        movdqu  [%%GDATA + HashKey_6], %%T4
+        pshufd  %%T1, %%T4, 01001110b
+        pxor    %%T1, %%T4
+        movdqu  [%%GDATA + HashKey_6_k], %%T1
+
+        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6              ;  %%T4 = HashKey^7<<1 mod poly
+        movdqu  [%%GDATA + HashKey_7], %%T4
+        pshufd  %%T1, %%T4, 01001110b
+        pxor    %%T1, %%T4
+        movdqu  [%%GDATA + HashKey_7_k], %%T1
+
+        GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6              ;  %%T4 = HashKey^8<<1 mod poly
+        movdqu  [%%GDATA + HashKey_8], %%T4
+        pshufd  %%T1, %%T4, 01001110b
+        pxor    %%T1, %%T4
+        movdqu  [%%GDATA + HashKey_8_k], %%T1
+
+; On exit %%T4 still holds the last power computed (HashKey^8<<1 mod poly); %%T1-%%T6 are clobbered, %%HK is unchanged
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT	6
+%define	%%OUTPUT		%1 ; %%OUTPUT is an xmm register
+%define	%%INPUT			%2 ; address of the first input byte (only read)
+%define	%%LENGTH		%3 ; number of input bytes (only read)
+%define	%%END_READ_LOCATION	%4 ; This and the parameters below are temporary general-purpose registers
+%define	%%COUNTER		%5
+%define	%%TMP1			%6
+	; Bytes of %%OUTPUT that receive no input stay zero. Clobbers the three temporaries and the flags.
+	pxor	%%OUTPUT, %%OUTPUT
+	mov	%%COUNTER, %%LENGTH
+	mov	%%END_READ_LOCATION, %%INPUT
+	add	%%END_READ_LOCATION, %%LENGTH	; one past the last input byte
+	xor	%%TMP1, %%TMP1
+
+	; 8 or more bytes: load the low quadword in one go, then gather the rest byte-wise
+	cmp	%%COUNTER, 8
+	jl	%%_byte_loop_2
+	pinsrq	%%OUTPUT, [%%INPUT],0		;Read in the first 8 bytes (only reached when at least 8 are available)
+	je	%%_done				;flags are still those of the cmp above (pinsrq does not alter them): exactly 8 bytes
+
+	sub	%%COUNTER, 8
+
+%%_byte_loop_1:					;Read in data 1 byte at a time while data is left
+	shl	%%TMP1, 8			;This loop handles when 8 bytes were already read in
+	dec	%%END_READ_LOCATION		;walk backwards from the last input byte
+	mov	BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]	;the byte read last (lowest address) ends up in the lowest byte
+	dec	%%COUNTER
+	jg	%%_byte_loop_1
+	pinsrq	%%OUTPUT, %%TMP1, 1		;bytes 8 and up form the high quadword
+	jmp	%%_done
+
+%%_byte_loop_2:					;Read in data 1 byte at a time while data is left
+	cmp	%%COUNTER, 0			;a zero length leaves %%OUTPUT all zero
+	je	%%_done
+	shl	%%TMP1, 8			;This loop handles when no bytes were already read in
+	dec	%%END_READ_LOCATION		;walk backwards from the last input byte
+	mov	BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+	dec	%%COUNTER
+	jg	%%_byte_loop_2
+	pinsrq	%%OUTPUT, %%TMP1, 0		;fewer than 8 bytes: they form the low quadword
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro  CALC_AAD_HASH   15
+%define %%A_IN          %1      ; AAD address
+%define %%A_LEN         %2      ; AAD length in bytes
+%define %%AAD_HASH      %3      ; xmm: zeroed on entry to the macro, holds the hash on exit
+%define %%GDATA_KEY     %4      ; base address of the table holding HashKey .. HashKey_8
+%define %%XTMP0         %5      ; xmm temp reg 0
+%define %%XTMP1         %6      ; xmm temp reg 1
+%define %%XTMP2         %7
+%define %%XTMP3         %8
+%define %%XTMP4         %9
+%define %%XTMP5         %10     ; xmm temp reg 5
+%define %%T1            %11     ; temp reg 1
+%define %%T2            %12
+%define %%T3            %13
+%define %%T4            %14
+%define %%T5            %15     ; temp reg 5
+        ; Three stages: 128-byte chunks against HashKey_8..HashKey_1, then the remaining whole
+        ; 16-byte blocks, then a final short (<16 byte) block via READ_SMALL_DATA_INPUT + GHASH_MUL.
+        mov     %%T1, %%A_IN            ; T1 = AAD
+        mov     %%T2, %%A_LEN           ; T2 = aadLen
+        pxor    %%AAD_HASH, %%AAD_HASH
+
+%%_get_AAD_loop128:
+        cmp     %%T2, 128
+        jl      %%_exit_AAD_loop128     ; signed compare -- NOTE(review): assumes the length is below 2^63
+
+        movdqu          %%XTMP0, [%%T1 + 16*0]
+        pshufb          %%XTMP0, [rel SHUF_MASK]
+
+        pxor            %%XTMP0, %%AAD_HASH                    ; fold the running hash into the first block
+
+        movdqu          %%XTMP5, [%%GDATA_KEY + HashKey_8]
+        movdqa          %%XTMP1, %%XTMP0
+        movdqa          %%XTMP2, %%XTMP0
+        movdqa          %%XTMP3, %%XTMP0
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP1, %%XTMP5, 0x11                 ; %%XTMP1 = a1*b1
+        pclmulqdq       %%XTMP2, %%XTMP5, 0x00                 ; %%XTMP2 = a0*b0
+        pclmulqdq       %%XTMP3, %%XTMP5, 0x01                 ; %%XTMP3 = a1*b0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x10                 ; %%XTMP4 = a0*b1
+        pxor            %%XTMP3, %%XTMP4                       ; %%XTMP3 = a1*b0 + a0*b1
+
+%assign i 1
+%assign j 7
+%rep 7
+        movdqu          %%XTMP0, [%%T1 + 16*i]
+        pshufb          %%XTMP0, [rel SHUF_MASK]
+
+        movdqu          %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x11                          ; %%XTMP1 = %%XTMP1 + a1*b1
+        pxor            %%XTMP1, %%XTMP4
+
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x00                          ; %%XTMP2 = %%XTMP2 + a0*b0
+        pxor            %%XTMP2, %%XTMP4
+
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x01                          ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+        pxor            %%XTMP3, %%XTMP4
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x10
+        pxor            %%XTMP3, %%XTMP4
+%assign i (i + 1)
+%assign j (j - 1)
+%endrep
+
+        movdqa          %%XTMP4, %%XTMP3
+        pslldq          %%XTMP4, 8                                      ; shift-L 2 DWs
+        psrldq          %%XTMP3, 8                                      ; shift-R 2 DWs
+        pxor            %%XTMP2, %%XTMP4
+        pxor            %%XTMP1, %%XTMP3                                ; accumulate the results in %%XTMP1(M):%%XTMP2(L)
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;first phase of the reduction
+        movdqa          %%XTMP5, [rel POLY2]
+        movdqa          %%XTMP0, %%XTMP5
+        pclmulqdq       %%XTMP0, %%XTMP2, 0x01
+        pslldq          %%XTMP0, 8                                      ; shift-L %%XTMP0 2 DWs
+        pxor            %%XTMP2, %%XTMP0                                ; first phase of the reduction complete
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;second phase of the reduction
+        movdqa          %%XTMP3, %%XTMP5
+        pclmulqdq       %%XTMP3, %%XTMP2, 0x00
+        psrldq          %%XTMP3, 4                                      ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+        movdqa          %%XTMP4, %%XTMP5
+        pclmulqdq       %%XTMP4, %%XTMP2, 0x10
+        pslldq          %%XTMP4, 4                                      ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+        pxor            %%XTMP4, %%XTMP3                                ; second phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        movdqa          %%AAD_HASH, %%XTMP1
+        pxor            %%AAD_HASH, %%XTMP4                             ; the result is in %%AAD_HASH
+
+        sub     %%T2, 128
+        je      %%_CALC_AAD_done        ; nothing left after this chunk
+
+        add     %%T1, 128
+        jmp     %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+        cmp     %%T2, 16
+        jl      %%_get_small_AAD_block
+
+        ;; calculate hash_key position to start with
+        mov     %%T3, %%T2
+        and     %%T3, -16       ; 1 to 7 blocks possible here
+        neg     %%T3
+        add     %%T3, HashKey_1 + 16    ; offset that lies 16*(number of whole blocks) below HashKey_1 + 16
+        lea     %%T3, [%%GDATA_KEY + %%T3]      ; presumably the address of HashKey_<blocks>; depends on the table layout in gcm_keys_sse_avx.asm -- confirm
+
+        movdqu          %%XTMP0, [%%T1]
+        pshufb          %%XTMP0, [rel SHUF_MASK]
+
+        pxor            %%XTMP0, %%AAD_HASH                    ; fold the running hash into the first block
+
+        movdqu          %%XTMP5, [%%T3]
+        movdqa          %%XTMP1, %%XTMP0
+        movdqa          %%XTMP2, %%XTMP0
+        movdqa          %%XTMP3, %%XTMP0
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP1, %%XTMP5, 0x11                 ; %%XTMP1 = a1*b1
+        pclmulqdq       %%XTMP2, %%XTMP5, 0x00                 ; %%XTMP2 = a0*b0
+        pclmulqdq       %%XTMP3, %%XTMP5, 0x01                 ; %%XTMP3 = a1*b0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x10                 ; %%XTMP4 = a0*b1
+        pxor            %%XTMP3, %%XTMP4                       ; %%XTMP3 = a1*b0 + a0*b1
+
+        add     %%T3, 16        ; move to next hashkey
+        add     %%T1, 16        ; move to next data block
+        sub     %%T2, 16
+        cmp     %%T2, 16
+        jl      %%_AAD_reduce
+
+%%_AAD_blocks:
+        movdqu          %%XTMP0, [%%T1]
+        pshufb          %%XTMP0, [rel SHUF_MASK]
+
+        movdqu          %%XTMP5, [%%T3]
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x11                 ; %%XTMP1 = %%XTMP1 + a1*b1
+        pxor            %%XTMP1, %%XTMP4
+
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x00                 ; %%XTMP2 = %%XTMP2 + a0*b0
+        pxor            %%XTMP2, %%XTMP4
+
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x01                 ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+        pxor            %%XTMP3, %%XTMP4
+        movdqa          %%XTMP4, %%XTMP0
+        pclmulqdq       %%XTMP4, %%XTMP5, 0x10
+        pxor            %%XTMP3, %%XTMP4
+
+        add     %%T3, 16        ; move to next hashkey
+        add     %%T1, 16        ; move to next data block
+        sub     %%T2, 16
+        cmp     %%T2, 16
+        jl      %%_AAD_reduce
+        jmp     %%_AAD_blocks
+
+%%_AAD_reduce:
+        movdqa          %%XTMP4, %%XTMP3
+        pslldq          %%XTMP4, 8                             ; shift-L 2 DWs
+        psrldq          %%XTMP3, 8                             ; shift-R 2 DWs
+        pxor            %%XTMP2, %%XTMP4
+        pxor            %%XTMP1, %%XTMP3                       ; accumulate the results in %%XTMP1(M):%%XTMP2(L)
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;first phase of the reduction
+        movdqa          %%XTMP5, [rel POLY2]
+        movdqa          %%XTMP0, %%XTMP5
+        pclmulqdq       %%XTMP0, %%XTMP2, 0x01
+        pslldq          %%XTMP0, 8                             ; shift-L %%XTMP0 2 DWs
+        pxor            %%XTMP2, %%XTMP0                       ; first phase of the reduction complete
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;second phase of the reduction
+        movdqa          %%XTMP3, %%XTMP5
+        pclmulqdq       %%XTMP3, %%XTMP2, 0x00
+        psrldq          %%XTMP3, 4                             ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+        movdqa          %%XTMP4, %%XTMP5
+        pclmulqdq       %%XTMP4, %%XTMP2, 0x10
+        pslldq          %%XTMP4, 4                             ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+        pxor            %%XTMP4, %%XTMP3                       ; second phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        movdqa          %%AAD_HASH, %%XTMP1
+        pxor            %%AAD_HASH, %%XTMP4                    ; the result is in %%AAD_HASH
+
+        or      %%T2, %%T2              ; any bytes (fewer than 16) left over?
+        je      %%_CALC_AAD_done
+
+%%_get_small_AAD_block:
+        movdqu          %%XTMP0, [%%GDATA_KEY + HashKey]
+        READ_SMALL_DATA_INPUT   %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+        ;byte-reflect the AAD data
+        pshufb          %%XTMP1, [rel SHUF_MASK]
+        pxor            %%AAD_HASH, %%XTMP1
+        GHASH_MUL       %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK	8
+%define	%%GDATA_KEY		%1	; base address of the key data (HashKey is read from it)
+%define	%%GDATA_CTX		%2	; base address of the context (PBlockLen, PBlockEncKey, AadHash)
+%define	%%CYPH_PLAIN_OUT	%3	; output buffer address
+%define	%%PLAIN_CYPH_IN		%4	; input buffer address
+%define	%%PLAIN_CYPH_LEN	%5	; input length in bytes
+%define	%%DATA_OFFSET		%6	; advanced by the number of bytes written out
+%define	%%AAD_HASH		%7	; xmm with the running hash; updated and stored to the context
+%define	%%ENC_DEC		%8	; DEC selects the decrypt path, anything else the encrypt path
+	mov	r13, [%%GDATA_CTX + PBlockLen]
+	cmp	r13, 0
+	je	%%_partial_block_done		;Leave Macro if no partial blocks
+
+	cmp	%%PLAIN_CYPH_LEN, 16		;Read in input data without over reading
+	jl	%%_fewer_than_16_bytes
+	XLDR	xmm1, [%%PLAIN_CYPH_IN]		;With at least 16 bytes of data, just fill the xmm register
+	jmp	%%_data_read
+	; NOTE(review): the load above does not add %%DATA_OFFSET while the short path below does; presumably the offset is 0 on entry -- confirm at the call sites
+%%_fewer_than_16_bytes:
+	lea	r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+	READ_SMALL_DATA_INPUT	xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+	mov	r13, [%%GDATA_CTX + PBlockLen]	; r13 = PBlockLen again
+
+%%_data_read:				;Finished reading in data
+
+
+	movdqu	xmm9, [%%GDATA_CTX + PBlockEncKey]	;xmm9 = ctx_data.partial_block_enc_key
+	movdqu	xmm13, [%%GDATA_KEY + HashKey]
+
+	lea	r12, [SHIFT_MASK]
+
+	add	r12, r13			; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+	movdqu	xmm2, [r12]			; get the appropriate shuffle mask
+	pshufb	xmm9, xmm2			;shift right r13 bytes
+
+%ifidn	%%ENC_DEC, DEC
+	movdqa	xmm3, xmm1			; keep the ciphertext: it is what gets hashed when decrypting
+	pxor	xmm9, xmm1			; Cyphertext XOR E(K, Yn)
+
+	mov	r15, %%PLAIN_CYPH_LEN
+	add	r15, r13
+	sub	r15, 16				;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+	jge	%%_no_extra_mask_1		;Determine if partial block is not being filled and shift mask accordingly
+	sub	r12, r15			; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_1:
+
+	movdqu	xmm1, [r12 + ALL_F-SHIFT_MASK]	; get the appropriate mask to mask out bottom r13 bytes of xmm9
+	pand	xmm9, xmm1			; mask out bottom r13 bytes of xmm9
+
+	pand	xmm3, xmm1
+	pshufb	xmm3, [SHUF_MASK]
+	pshufb	xmm3, xmm2
+	pxor	%%AAD_HASH, xmm3
+
+
+	cmp	r15,0
+	jl	%%_partial_incomplete_1		; block still not full: defer the GHASH multiply
+
+	GHASH_MUL	%%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6	;GHASH computation for the last <16 Byte block
+	xor	rax,rax
+	mov	[%%GDATA_CTX + PBlockLen], rax	; block completed: no partial block pending any more
+	jmp	%%_dec_done
+%%_partial_incomplete_1:
+%ifidn __OUTPUT_FORMAT__, win64
+        mov     rax, %%PLAIN_CYPH_LEN		; go through rax (presumably the length is a memory operand on win64 -- confirm)
+       	add     [%%GDATA_CTX + PBlockLen], rax
+%else
+       	add	[%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_dec_done:
+	movdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+	pxor	xmm9, xmm1	; Plaintext XOR E(K, Yn)
+
+	mov	r15, %%PLAIN_CYPH_LEN
+	add	r15, r13
+	sub	r15, 16				;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+	jge	%%_no_extra_mask_2		;Determine if partial block is not being filled and shift mask accordingly
+	sub	r12, r15			; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_2:
+
+	movdqu	xmm1, [r12 + ALL_F-SHIFT_MASK]	; get the appropriate mask to mask out bottom r13 bytes of xmm9
+	pand	xmm9, xmm1			; mask out bottom r13  bytes of xmm9
+
+	pshufb	xmm9, [SHUF_MASK]
+	pshufb	xmm9, xmm2
+	pxor	%%AAD_HASH, xmm9		; when encrypting, the freshly produced ciphertext is what gets hashed
+
+	cmp	r15,0
+	jl	%%_partial_incomplete_2		; block still not full: defer the GHASH multiply
+
+	GHASH_MUL	%%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6	;GHASH computation for the last <16 Byte block
+	xor	rax,rax
+	mov	[%%GDATA_CTX + PBlockLen], rax	; block completed: no partial block pending any more
+	jmp	%%_encode_done
+%%_partial_incomplete_2:
+%ifidn __OUTPUT_FORMAT__, win64
+        mov     rax, %%PLAIN_CYPH_LEN		; go through rax (presumably the length is a memory operand on win64 -- confirm)
+       	add     [%%GDATA_CTX + PBlockLen], rax
+%else
+       	add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_encode_done:
+	movdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH
+
+	pshufb	xmm9, [SHUF_MASK]	; shuffle xmm9 back to output as ciphertext
+	pshufb	xmm9, xmm2
+%endif
+
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	; output encrypted Bytes
+	cmp	r15,0
+	jl	%%_partial_fill
+	mov	r12, r13
+	mov	r13, 16
+	sub	r13, r12			; Set r13 to be the number of bytes to write out
+	jmp	%%_count_set
+%%_partial_fill:
+	mov	r13, %%PLAIN_CYPH_LEN		; block not filled: every input byte is written out
+%%_count_set:
+	movq	rax, xmm9
+	cmp	r13, 8
+	jle	%%_less_than_8_bytes_left	; 8 or fewer bytes: store them one at a time
+
+	mov	[%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax	; more than 8 bytes: store the low quadword in one go
+	add	%%DATA_OFFSET, 8
+	psrldq	xmm9, 8
+	movq	rax, xmm9
+	sub	r13, 8
+%%_less_than_8_bytes_left:
+	mov	BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+	add	%%DATA_OFFSET, 1
+	shr	rax, 8
+	sub	r13, 1
+	jne	%%_less_than_8_bytes_left
+         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 24
+%define	%%GDATA_KEY		%1
+%define	%%GDATA_CTX		%2
+%define	%%CYPH_PLAIN_OUT	%3
+%define	%%PLAIN_CYPH_IN		%4
+%define	%%LENGTH		%5
+%define	%%DATA_OFFSET		%6
+%define	%%num_initial_blocks	%7	; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define	%%T1		%8
+%define	%%HASH_KEY	%9
+%define	%%T3		%10
+%define	%%T4		%11
+%define	%%T5		%12
+%define	%%CTR		%13
+%define	%%XMM1		%14
+%define	%%XMM2		%15
+%define	%%XMM3		%16
+%define	%%XMM4		%17
+%define	%%XMM5		%18
+%define	%%XMM6		%19
+%define	%%XMM7		%20
+%define	%%XMM8		%21
+%define	%%T6		%22
+%define	%%T_key		%23
+%define	%%ENC_DEC	%24
+
+%assign i       (8-%%num_initial_blocks)
+		movdqu	reg(i), %%XMM8	; move AAD_HASH to temp reg
+
+	        ; start AES for %%num_initial_blocks blocks
+	        movdqu  %%CTR, [%%GDATA_CTX + CurCount]	; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                paddd   %%CTR, [ONE]           ; INCR Y0
+                movdqa  reg(i), %%CTR
+                pshufb  reg(i), [SHUF_MASK]     ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+movdqu  %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                pxor    reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS							; encrypt N blocks with 13 key rounds (11 for GCM192)
+movdqu  %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                aesenc  reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+movdqu  %%T_key, [%%GDATA_KEY+16*j]				; encrypt with last (14th) key round (12 for GCM192)
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                aesenclast      reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+                pxor    reg(i), %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)            ; write back ciphertext for %%num_initial_blocks blocks
+                add     %%DATA_OFFSET, 16
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  reg(i), %%T1
+                %endif
+                pshufb  reg(i), [SHUF_MASK]     ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+        pxor    reg(j), reg(i)
+        GHASH_MUL       reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6      ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+        ; %%XMM8 has the current Hash Value
+        movdqa  %%T3, %%XMM8
+
+        cmp     %%LENGTH, 128
+        jl      %%_initial_blocks_done
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare 8 counter blocks and perform rounds of AES cipher on them, load plain/cipher text and store cipher/plain text.
+; Keep 8 cipher text blocks for further GHASH computations (XMM1 - XMM8)
+; - combine current GHASH value into block 0 (XMM1)
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM1, %%CTR
+                pshufb  %%XMM1, [SHUF_MASK]             ; perform a 16Byte swap
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM2, %%CTR
+                pshufb  %%XMM2, [SHUF_MASK]             ; perform a 16Byte swap
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM3, %%CTR
+                pshufb  %%XMM3, [SHUF_MASK]             ; perform a 16Byte swap
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM4, %%CTR
+                pshufb  %%XMM4, [SHUF_MASK]             ; perform a 16Byte swap
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM5, %%CTR
+                pshufb  %%XMM5, [SHUF_MASK]             ; perform a 16Byte swap
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM6, %%CTR
+                pshufb  %%XMM6, [SHUF_MASK]             ; perform a 16Byte swap
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM7, %%CTR
+                pshufb  %%XMM7, [SHUF_MASK]             ; perform a 16Byte swap
+
+                paddd   %%CTR, [ONE]                   ; INCR Y0
+                movdqa  %%XMM8, %%CTR
+                pshufb  %%XMM8, [SHUF_MASK]             ; perform a 16Byte swap
+
+                movdqu  %%T_key, [%%GDATA_KEY+16*0]
+                pxor    %%XMM1, %%T_key
+                pxor    %%XMM2, %%T_key
+                pxor    %%XMM3, %%T_key
+                pxor    %%XMM4, %%T_key
+                pxor    %%XMM5, %%T_key
+                pxor    %%XMM6, %%T_key
+                pxor    %%XMM7, %%T_key
+                pxor    %%XMM8, %%T_key
+
+
+%assign i 1
+%rep    NROUNDS       						; do early (13) rounds (11 for GCM192)
+                movdqu  %%T_key, [%%GDATA_KEY+16*i]
+                aesenc  %%XMM1, %%T_key
+                aesenc  %%XMM2, %%T_key
+                aesenc  %%XMM3, %%T_key
+                aesenc  %%XMM4, %%T_key
+                aesenc  %%XMM5, %%T_key
+                aesenc  %%XMM6, %%T_key
+                aesenc  %%XMM7, %%T_key
+                aesenc  %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+                movdqu          %%T_key, [%%GDATA_KEY+16*i]		; do final key round
+                aesenclast      %%XMM1, %%T_key
+                aesenclast      %%XMM2, %%T_key
+                aesenclast      %%XMM3, %%T_key
+                aesenclast      %%XMM4, %%T_key
+                aesenclast      %%XMM5, %%T_key
+                aesenclast      %%XMM6, %%T_key
+                aesenclast      %%XMM7, %%T_key
+                aesenclast      %%XMM8, %%T_key
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+                pxor    %%XMM1, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM1, %%T1
+                %endif
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+                pxor    %%XMM2, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM2, %%T1
+                %endif
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+                pxor    %%XMM3, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM3, %%T1
+                %endif
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+                pxor    %%XMM4, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM4, %%T1
+                %endif
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+                pxor    %%XMM5, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM5, %%T1
+                %endif
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+                pxor    %%XMM6, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM6, %%T1
+                %endif
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+                pxor    %%XMM7, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM7, %%T1
+                %endif
+
+                XLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+                pxor    %%XMM8, %%T1
+                XSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+                %ifidn  %%ENC_DEC, DEC
+                movdqa  %%XMM8, %%T1
+                %endif
+
+                add     %%DATA_OFFSET, 128
+
+                pshufb  %%XMM1, [SHUF_MASK]             ; perform a 16Byte swap
+                pxor    %%XMM1, %%T3                    ; combine GHASHed value with the corresponding ciphertext
+                pshufb  %%XMM2, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM3, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM4, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM5, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM6, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM7, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM8, [SHUF_MASK]             ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; GHASH_8_ENCRYPT_8_PARALLEL: AES-CTR process 8 new blocks while GHASHing the 8 blocks passed in %%XMM1-%%XMM8
+; (the 8 previously produced ciphertext blocks, running hash already xored into %%XMM1); AES and PCLMULQDQ work is interleaved
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value (only read here); clobbers %%T1-%%T7, %%CTR, %%XMM1-%%XMM8 and stack slots TMP2-TMP8
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define	%%GDATA			%1
+%define	%%CYPH_PLAIN_OUT	%2
+%define	%%PLAIN_CYPH_IN		%3
+%define	%%DATA_OFFSET		%4
+%define	%%T1	%5
+%define	%%T2	%6
+%define	%%T3	%7
+%define	%%T4	%8
+%define	%%T5	%9
+%define	%%T6	%10
+%define	%%CTR	%11
+%define	%%XMM1	%12
+%define	%%XMM2	%13
+%define	%%XMM3	%14
+%define	%%XMM4	%15
+%define	%%XMM5	%16
+%define	%%XMM6	%17
+%define	%%XMM7	%18
+%define	%%XMM8	%19
+%define	%%T7	%20
+%define	%%loop_idx	%21
+%define	%%ENC_DEC	%22
+; %%loop_idx: in_order => counter stepped with [ONE] and byte-swapped via SHUF_MASK; anything else => stepped with [ONEf], no swap. %%ENC_DEC: DEC keeps the input blocks for the next GHASH.
+        movdqa  %%T7, %%XMM1                            ; %%T7 = block 1 (multiplied right away, never spilled)
+        movdqu  [rsp + TMP2], %%XMM2                    ; spill blocks 2-8: %%XMM1-%%XMM8 are reused below for the new counter blocks
+        movdqu  [rsp + TMP3], %%XMM3
+        movdqu  [rsp + TMP4], %%XMM4
+        movdqu  [rsp + TMP5], %%XMM5
+        movdqu  [rsp + TMP6], %%XMM6
+        movdqu  [rsp + TMP7], %%XMM7
+        movdqu  [rsp + TMP8], %%XMM8
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; Karatsuba Method: block 1 x HashKey_8; starts the accumulators %%T4 (high), %%T7 (low), %%T6 (middle)
+
+        movdqa  %%T4, %%T7
+        pshufd  %%T6, %%T7, 01001110b                   ; swap the two qwords of block 1
+        pxor    %%T6, %%T7                              ; %%T6 = a1+a0 in both qwords
+                %ifidn %%loop_idx, in_order
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                %else
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                %endif
+        movdqu  %%T5, [%%GDATA + HashKey_8]
+        pclmulqdq       %%T4, %%T5, 0x11                        ; %%T4 = a1*b1
+        pclmulqdq       %%T7, %%T5, 0x00                        ; %%T7 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_8_k]                   ; presumably the precomputed (b1+b0) of HashKey_8 -- confirm against the key setup code
+        pclmulqdq       %%T6, %%T5, 0x00                        ; %%T6 = (a1+a0)*(b1+b0)
+                movdqa %%XMM1, %%CTR                   ; %%XMM1 = first new counter block
+
+                %ifidn %%loop_idx, in_order
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                movdqa %%XMM2, %%CTR
+
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                movdqa %%XMM3, %%CTR
+
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                movdqa %%XMM4, %%CTR
+
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                movdqa %%XMM5, %%CTR
+
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                movdqa %%XMM6, %%CTR
+
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                movdqa %%XMM7, %%CTR
+
+                paddd  %%CTR, [ONE]                    ; INCR CNT
+                movdqa %%XMM8, %%CTR
+
+                pshufb  %%XMM1, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM2, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM3, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM4, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM5, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM6, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM7, [SHUF_MASK]             ; perform a 16Byte swap
+                pshufb  %%XMM8, [SHUF_MASK]             ; perform a 16Byte swap
+                %else
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                movdqa %%XMM2, %%CTR
+
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                movdqa %%XMM3, %%CTR
+
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                movdqa %%XMM4, %%CTR
+
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                movdqa %%XMM5, %%CTR
+
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                movdqa %%XMM6, %%CTR
+
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                movdqa %%XMM7, %%CTR
+
+                paddd  %%CTR, [ONEf]                   ; INCR CNT
+                movdqa %%XMM8, %%CTR
+                %endif
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+                movdqu  %%T1, [%%GDATA + 16*0]          ; AES round 0: xor the counter blocks with the first round key
+                pxor    %%XMM1, %%T1
+                pxor    %%XMM2, %%T1
+                pxor    %%XMM3, %%T1
+                pxor    %%XMM4, %%T1
+                pxor    %%XMM5, %%T1
+                pxor    %%XMM6, %%T1
+                pxor    %%XMM7, %%T1
+                pxor    %%XMM8, %%T1
+
+        ;; GHASH of spilled block 2 (TMP2) with HashKey_7; products are accumulated into %%T4 (high), %%T7 (low), %%T6 (middle)
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; Karatsuba Method
+        movdqu  %%T1, [rsp + TMP2]
+        movdqa  %%T3, %%T1
+
+        pshufd  %%T2, %%T3, 01001110b
+        pxor    %%T2, %%T3
+        movdqu  %%T5, [%%GDATA + HashKey_7]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%T3, %%T5, 0x00                ; %%T3 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_7_k]
+        pclmulqdq       %%T2, %%T5, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T4, %%T1                              ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+        pxor    %%T7, %%T3
+        pxor    %%T6, %%T2
+
+                movdqu  %%T1, [%%GDATA + 16*1]          ; AES round 1
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+
+                movdqu  %%T1, [%%GDATA + 16*2]          ; AES round 2
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ; Karatsuba Method: spilled block 3 (TMP3) x HashKey_6
+        movdqu  %%T1, [rsp + TMP3]
+        movdqa  %%T3, %%T1
+        pshufd  %%T2, %%T3, 01001110b
+        pxor    %%T2, %%T3
+        movdqu  %%T5, [%%GDATA + HashKey_6]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%T3, %%T5, 0x00                ; %%T3 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_6_k]
+        pclmulqdq       %%T2, %%T5, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T4, %%T1                              ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+        pxor    %%T7, %%T3
+        pxor    %%T6, %%T2
+
+                movdqu  %%T1, [%%GDATA + 16*3]          ; AES round 3
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+        movdqu  %%T1, [rsp + TMP4]                      ; Karatsuba: spilled block 4 (TMP4) x HashKey_5
+        movdqa  %%T3, %%T1
+        pshufd  %%T2, %%T3, 01001110b
+        pxor    %%T2, %%T3
+        movdqu  %%T5, [%%GDATA + HashKey_5]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%T3, %%T5, 0x00                ; %%T3 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_5_k]
+        pclmulqdq       %%T2, %%T5, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T4, %%T1                              ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+        pxor    %%T7, %%T3
+        pxor    %%T6, %%T2
+
+                movdqu  %%T1, [%%GDATA + 16*4]          ; AES round 4
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+                movdqu  %%T1, [%%GDATA + 16*5]          ; AES round 5
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+        movdqu  %%T1, [rsp + TMP5]                      ; Karatsuba: spilled block 5 (TMP5) x HashKey_4
+        movdqa  %%T3, %%T1
+        pshufd  %%T2, %%T3, 01001110b
+        pxor    %%T2, %%T3
+        movdqu  %%T5, [%%GDATA + HashKey_4]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%T3, %%T5, 0x00                ; %%T3 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_4_k]
+        pclmulqdq       %%T2, %%T5, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T4, %%T1                              ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+        pxor    %%T7, %%T3
+        pxor    %%T6, %%T2
+
+
+                movdqu  %%T1, [%%GDATA + 16*6]          ; AES round 6
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+        movdqu  %%T1, [rsp + TMP6]                      ; Karatsuba: spilled block 6 (TMP6) x HashKey_3
+        movdqa  %%T3, %%T1
+        pshufd  %%T2, %%T3, 01001110b
+        pxor    %%T2, %%T3
+        movdqu  %%T5, [%%GDATA + HashKey_3]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%T3, %%T5, 0x00                ; %%T3 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_3_k]
+        pclmulqdq       %%T2, %%T5, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T4, %%T1                              ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+        pxor    %%T7, %%T3
+        pxor    %%T6, %%T2
+
+                movdqu  %%T1, [%%GDATA + 16*7]          ; AES round 7
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+        movdqu  %%T1, [rsp + TMP7]                      ; Karatsuba: spilled block 7 (TMP7) x HashKey_2
+        movdqa  %%T3, %%T1
+        pshufd  %%T2, %%T3, 01001110b
+        pxor    %%T2, %%T3
+        movdqu  %%T5, [%%GDATA + HashKey_2]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%T3, %%T5, 0x00                ; %%T3 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_2_k]
+        pclmulqdq       %%T2, %%T5, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T4, %%T1                              ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+        pxor    %%T7, %%T3
+        pxor    %%T6, %%T2
+
+                movdqu  %%T1, [%%GDATA + 16*8]          ; AES round 8
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+
+        ;; GHASH of spilled block 8 (TMP8) with HashKey; its middle product stays in %%T2 and is merged with %%T6 after the last AES round
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; Karatsuba Method
+        movdqu  %%T1, [rsp + TMP8]
+        movdqa  %%T3, %%T1
+
+        pshufd  %%T2, %%T3, 01001110b
+        pxor    %%T2, %%T3
+        movdqu  %%T5, [%%GDATA + HashKey]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+        pclmulqdq       %%T3, %%T5, 0x00                ; %%T3 = a0*b0
+        movdqu  %%T5, [%%GDATA + HashKey_k]
+        pclmulqdq       %%T2, %%T5, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+        pxor    %%T7, %%T3                              ; low and high parts are accumulated now; %%T2 must stay live
+        pxor    %%T4, %%T1                              ; until it is xored with %%T6 after the output loop (%%T1, %%T3, %%T5 get reused)
+
+                movdqu  %%T1, [%%GDATA + 16*9]          ; AES round 9
+                aesenc  %%XMM1, %%T1
+                aesenc  %%XMM2, %%T1
+                aesenc  %%XMM3, %%T1
+                aesenc  %%XMM4, %%T1
+                aesenc  %%XMM5, %%T1
+                aesenc  %%XMM6, %%T1
+                aesenc  %%XMM7, %%T1
+                aesenc  %%XMM8, %%T1
+
+
+%ifdef GCM128_MODE
+		movdqu	%%T5, [%%GDATA + 16*10]        ; %%T5 = last round key (round 10), applied by aesenclast below
+%endif
+%ifdef GCM192_MODE
+		movdqu	%%T1, [%%GDATA + 16*10]        ; AES round 10
+		aesenc	%%XMM1, %%T1
+		aesenc	%%XMM2, %%T1
+		aesenc	%%XMM3, %%T1
+		aesenc	%%XMM4, %%T1
+		aesenc	%%XMM5, %%T1
+		aesenc	%%XMM6, %%T1
+		aesenc	%%XMM7, %%T1
+		aesenc	%%XMM8, %%T1
+
+		movdqu	%%T1, [%%GDATA + 16*11]        ; AES round 11
+		aesenc	%%XMM1, %%T1
+		aesenc	%%XMM2, %%T1
+		aesenc	%%XMM3, %%T1
+		aesenc	%%XMM4, %%T1
+		aesenc	%%XMM5, %%T1
+		aesenc	%%XMM6, %%T1
+		aesenc	%%XMM7, %%T1
+		aesenc	%%XMM8, %%T1
+
+		movdqu	%%T5, [%%GDATA + 16*12]        ; %%T5 = last round key (round 12), applied by aesenclast below
+%endif
+%ifdef GCM256_MODE
+		movdqu	%%T1, [%%GDATA + 16*10]        ; AES round 10
+		aesenc	%%XMM1, %%T1
+		aesenc	%%XMM2, %%T1
+		aesenc	%%XMM3, %%T1
+		aesenc	%%XMM4, %%T1
+		aesenc	%%XMM5, %%T1
+		aesenc	%%XMM6, %%T1
+		aesenc	%%XMM7, %%T1
+		aesenc	%%XMM8, %%T1
+
+		movdqu	%%T1, [%%GDATA + 16*11]        ; AES round 11
+		aesenc	%%XMM1, %%T1
+		aesenc	%%XMM2, %%T1
+		aesenc	%%XMM3, %%T1
+		aesenc	%%XMM4, %%T1
+		aesenc	%%XMM5, %%T1
+		aesenc	%%XMM6, %%T1
+		aesenc	%%XMM7, %%T1
+		aesenc	%%XMM8, %%T1
+
+		movdqu	%%T1, [%%GDATA + 16*12]        ; AES round 12
+		aesenc	%%XMM1, %%T1
+		aesenc	%%XMM2, %%T1
+		aesenc	%%XMM3, %%T1
+		aesenc	%%XMM4, %%T1
+		aesenc	%%XMM5, %%T1
+		aesenc	%%XMM6, %%T1
+		aesenc	%%XMM7, %%T1
+		aesenc	%%XMM8, %%T1
+
+		movdqu	%%T1, [%%GDATA + 16*13]        ; AES round 13
+		aesenc	%%XMM1, %%T1
+		aesenc	%%XMM2, %%T1
+		aesenc	%%XMM3, %%T1
+		aesenc	%%XMM4, %%T1
+		aesenc	%%XMM5, %%T1
+		aesenc	%%XMM6, %%T1
+		aesenc	%%XMM7, %%T1
+		aesenc	%%XMM8, %%T1
+
+	        movdqu	%%T5, [%%GDATA + 16*14]        ; %%T5 = last round key (round 14), applied by aesenclast below
+%endif
+; Last AES round fused with the CTR xor: aesenclast ends by xoring its key operand, so passing (input ^ last round key) yields keystream ^ input directly.
+%assign i 0
+%assign j 1
+%rep 8
+                XLDR  %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+%ifidn %%ENC_DEC, DEC
+                movdqa  %%T3, %%T1                      ; DEC: remember the input (ciphertext) block, it is what gets GHASHed next
+%endif
+
+                pxor    %%T1, %%T5                      ; %%T1 = input block ^ last round key
+                aesenclast      reg(j), %%T1          ; reg(j) is defined outside this view; presumably xmm<j>, i.e. %%XMM1..%%XMM8 when the caller passes xmm1-xmm8 -- confirm
+                XSTR  [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j)       ; Write to the Output buffer
+
+%ifidn %%ENC_DEC, DEC
+                movdqa  reg(j), %%T3                    ; DEC: replace the plaintext with the saved ciphertext for the next GHASH
+%endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+
+
+        pxor    %%T2, %%T6                              ; %%T2 = sum of all eight (a1+a0)*(b1+b0) products
+        pxor    %%T2, %%T4                              ; Karatsuba: middle term = that sum ^ high part ^ low part
+        pxor    %%T2, %%T7
+
+
+        movdqa  %%T3, %%T2
+        pslldq  %%T3, 8                                 ; shift-L %%T3 2 DWs
+        psrldq  %%T2, 8                                 ; shift-R %%T2 2 DWs
+        pxor    %%T7, %%T3
+        pxor    %%T4, %%T2                              ; accumulate the results in %%T4:%%T7
+
+
+
+        ;first phase of the reduction
+        movdqa  %%T2, %%T7
+        movdqa  %%T3, %%T7
+        movdqa  %%T1, %%T7                              ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift: << 31
+        pslld   %%T3, 30                                ; packed left shift: << 30
+        pslld   %%T1, 25                                ; packed left shift: << 25
+        pxor    %%T2, %%T3                              ; xor the shifted versions
+        pxor    %%T2, %%T1
+
+        movdqa  %%T5, %%T2                              ; %%T5 is free again: the last round key was consumed by the output loop
+        psrldq  %%T5, 4                                 ; shift-R %%T5 1 DW
+
+        pslldq  %%T2, 12                                ; shift-L %%T2 3 DWs
+        pxor    %%T7, %%T2                              ; first phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+                pshufb  %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
+                pshufb  %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
+                pshufb  %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
+                pshufb  %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
+                pshufb  %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
+                pshufb  %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
+                pshufb  %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
+                pshufb  %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap
+
+        ;second phase of the reduction
+        movdqa  %%T2,%%T7                               ; make 3 copies of %%T7 (in %%T2, %%T3, %%T1) for doing three shift operations
+        movdqa  %%T3,%%T7
+        movdqa  %%T1,%%T7
+
+        psrld   %%T2,1                                  ; packed right shift: >> 1
+        psrld   %%T3,2                                  ; packed right shift: >> 2
+        psrld   %%T1,7                                  ; packed right shift: >> 7
+        pxor    %%T2,%%T3                               ; xor the shifted versions
+        pxor    %%T2,%%T1
+
+        pxor    %%T2, %%T5
+        pxor    %%T7, %%T2
+        pxor    %%T7, %%T4                              ; the result is in %%T7
+
+
+        pxor    %%XMM1, %%T7                            ; fold the new hash value into the first block kept for the next GHASH
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks (%%XMM1-%%XMM8) with HashKey_8..HashKey; the reduced result is left in %%T6. Clobbers %%T1-%%T5, %%T7 and %%XMM1-%%XMM8.
+%macro	GHASH_LAST_8 16
+%define	%%GDATA	%1
+%define	%%T1	%2
+%define	%%T2	%3
+%define	%%T3	%4
+%define	%%T4	%5
+%define	%%T5	%6
+%define	%%T6	%7
+%define	%%T7	%8
+%define	%%XMM1	%9
+%define	%%XMM2	%10
+%define	%%XMM3	%11
+%define	%%XMM4	%12
+%define	%%XMM5	%13
+%define	%%XMM6	%14
+%define	%%XMM7	%15
+%define	%%XMM8	%16
+
+        ; Karatsuba Method: block 1 x HashKey_8 starts the accumulators
+        movdqa  %%T6, %%XMM1
+        pshufd  %%T2, %%XMM1, 01001110b                 ; swap the two qwords
+        pxor    %%T2, %%XMM1                            ; %%T2 = a1+a0 in both qwords
+        movdqu  %%T5, [%%GDATA + HashKey_8]
+        pclmulqdq       %%T6, %%T5, 0x11                ; %%T6 = a1*b1
+
+        pclmulqdq       %%XMM1, %%T5, 0x00              ; %%XMM1 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_8_k]           ; presumably the precomputed (b1+b0) of HashKey_8 -- confirm against the key setup code
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        movdqa  %%T7, %%XMM1
+        movdqa  %%XMM1, %%T2                            ; result in %%T6 (high), %%T7 (low), %%XMM1 (middle; %%XMM1 is reused as accumulator)
+
+
+        ; Karatsuba Method: block 2 x HashKey_7
+        movdqa  %%T1, %%XMM2
+        pshufd  %%T2, %%XMM2, 01001110b
+        pxor    %%T2, %%XMM2
+        movdqu  %%T5, [%%GDATA + HashKey_7]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM2, %%T5, 0x00              ; %%XMM2 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_7_k]
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        pxor    %%T6, %%T1
+        pxor    %%T7, %%XMM2
+        pxor    %%XMM1, %%T2                            ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+        ; Karatsuba Method: block 3 x HashKey_6
+        movdqa  %%T1, %%XMM3
+        pshufd  %%T2, %%XMM3, 01001110b
+        pxor    %%T2, %%XMM3
+        movdqu  %%T5, [%%GDATA + HashKey_6]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM3, %%T5, 0x00              ; %%XMM3 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_6_k]
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        pxor    %%T6, %%T1
+        pxor    %%T7, %%XMM3
+        pxor    %%XMM1, %%T2                            ; results accumulated in %%T6, %%T7, %%XMM1
+
+        ; Karatsuba Method: block 4 x HashKey_5
+        movdqa  %%T1, %%XMM4
+        pshufd  %%T2, %%XMM4, 01001110b
+        pxor    %%T2, %%XMM4
+        movdqu  %%T5, [%%GDATA + HashKey_5]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM4, %%T5, 0x00              ; %%XMM4 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_5_k]
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        pxor    %%T6, %%T1
+        pxor    %%T7, %%XMM4
+        pxor    %%XMM1, %%T2                            ; results accumulated in %%T6, %%T7, %%XMM1
+
+        ; Karatsuba Method: block 5 x HashKey_4
+        movdqa  %%T1, %%XMM5
+        pshufd  %%T2, %%XMM5, 01001110b
+        pxor    %%T2, %%XMM5
+        movdqu  %%T5, [%%GDATA + HashKey_4]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM5, %%T5, 0x00              ; %%XMM5 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_4_k]
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        pxor    %%T6, %%T1
+        pxor    %%T7, %%XMM5
+        pxor    %%XMM1, %%T2                            ; results accumulated in %%T6, %%T7, %%XMM1
+
+        ; Karatsuba Method: block 6 x HashKey_3
+        movdqa  %%T1, %%XMM6
+        pshufd  %%T2, %%XMM6, 01001110b
+        pxor    %%T2, %%XMM6
+        movdqu  %%T5, [%%GDATA + HashKey_3]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM6, %%T5, 0x00              ; %%XMM6 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_3_k]
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        pxor    %%T6, %%T1
+        pxor    %%T7, %%XMM6
+        pxor    %%XMM1, %%T2                            ; results accumulated in %%T6, %%T7, %%XMM1
+
+        ; Karatsuba Method: block 7 x HashKey_2
+        movdqa  %%T1, %%XMM7
+        pshufd  %%T2, %%XMM7, 01001110b
+        pxor    %%T2, %%XMM7
+        movdqu  %%T5, [%%GDATA + HashKey_2]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM7, %%T5, 0x00              ; %%XMM7 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_2_k]
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        pxor    %%T6, %%T1
+        pxor    %%T7, %%XMM7
+        pxor    %%XMM1, %%T2                            ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+        ; Karatsuba Method: block 8 x HashKey
+        movdqa  %%T1, %%XMM8
+        pshufd  %%T2, %%XMM8, 01001110b
+        pxor    %%T2, %%XMM8
+        movdqu  %%T5, [%%GDATA + HashKey]
+        pclmulqdq       %%T1, %%T5, 0x11                ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM8, %%T5, 0x00              ; %%XMM8 = a0*b0
+        movdqu  %%T4, [%%GDATA + HashKey_k]
+        pclmulqdq       %%T2, %%T4, 0x00                ; %%T2 = (a1+a0)*(b1+b0)
+
+        pxor    %%T6, %%T1
+        pxor    %%T7, %%XMM8
+        pxor    %%T2, %%XMM1                            ; %%T2 = sum of all eight (a1+a0)*(b1+b0) products
+        pxor    %%T2, %%T6
+        pxor    %%T2, %%T7                              ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+        movdqa  %%T4, %%T2
+        pslldq  %%T4, 8                                 ; shift-L %%T4 2 DWs
+        psrldq  %%T2, 8                                 ; shift-R %%T2 2 DWs
+        pxor    %%T7, %%T4
+        pxor    %%T6, %%T2                              ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+        ;first phase of the reduction
+        movdqa %%T2, %%T7
+        movdqa %%T3, %%T7
+        movdqa %%T4, %%T7                               ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld %%T2, 31                                  ; packed left shift: << 31
+        pslld %%T3, 30                                  ; packed left shift: << 30
+        pslld %%T4, 25                                  ; packed left shift: << 25
+        pxor %%T2, %%T3                                 ; xor the shifted versions
+        pxor %%T2, %%T4
+
+        movdqa %%T1, %%T2
+        psrldq %%T1, 4                                  ; shift-R %%T1 1 DW
+
+        pslldq %%T2, 12                                 ; shift-L %%T2 3 DWs
+        pxor %%T7, %%T2                                 ; first phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        ;second phase of the reduction
+        movdqa %%T2,%%T7                                ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
+        movdqa %%T3,%%T7
+        movdqa %%T4,%%T7
+
+        psrld %%T2,1                                    ; packed right shift: >> 1
+        psrld %%T3,2                                    ; packed right shift: >> 2
+        psrld %%T4,7                                    ; packed right shift: >> 7
+        pxor %%T2,%%T3                                  ; xor the shifted versions
+        pxor %%T2,%%T4
+
+        pxor %%T2, %%T1
+        pxor %%T7, %%T2
+        pxor %%T6, %%T7                                 ; the result is in %%T6
+
+%endmacro
+
+; Encryption of a single block: %%ST is AES-encrypted in place with the round keys at %%GDATA (round 0 xor, NROUNDS aesenc rounds, one aesenclast). Clobbers %%T1 and the preprocessor variable i.
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define	%%GDATA	%1
+%define	%%ST	%2
+%define	%%T1	%3
+		movdqu	%%T1, [%%GDATA+16*0]           ; round 0: xor the state with the first round key
+                pxor    %%ST, %%T1
+%assign i 1
+%rep NROUNDS
+		movdqu	%%T1, [%%GDATA+16*i]           ; round keys 1..NROUNDS, 16 bytes apart
+                aesenc  %%ST, %%T1
+%assign i (i+1)
+%endrep
+		movdqu	%%T1, [%%GDATA+16*i]           ; i == NROUNDS+1 here: the last round key
+                aesenclast      %%ST, %%T1
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+        ;; Common prologue for the update and single-call entry points.
+        ;; The pushes below have to stay consistent with STACK_OFFSET.
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+        mov     r14, rsp                        ; r14 = rsp after the pushes (FUNC_RESTORE reloads rsp from it)
+
+        sub     rsp, VARIABLE_OFFSET            ; reserve the local frame
+        and     rsp, ~63                        ; round rsp down to a multiple of 64
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ;; win64: xmm6-xmm15 must be preserved, park them in LOCAL_STORAGE
+        movdqu  [rsp + LOCAL_STORAGE + 0*16], xmm6
+        movdqu  [rsp + LOCAL_STORAGE + 1*16], xmm7
+        movdqu  [rsp + LOCAL_STORAGE + 2*16], xmm8
+        movdqu  [rsp + LOCAL_STORAGE + 3*16], xmm9
+        movdqu  [rsp + LOCAL_STORAGE + 4*16], xmm10
+        movdqu  [rsp + LOCAL_STORAGE + 5*16], xmm11
+        movdqu  [rsp + LOCAL_STORAGE + 6*16], xmm12
+        movdqu  [rsp + LOCAL_STORAGE + 7*16], xmm13
+        movdqu  [rsp + LOCAL_STORAGE + 8*16], xmm14
+        movdqu  [rsp + LOCAL_STORAGE + 9*16], xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+        ;; Common epilogue matching FUNC_SAVE.
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_sse_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+        movdqu  xmm6,  [rsp + LOCAL_STORAGE + 0*16]
+        movdqu  xmm7,  [rsp + LOCAL_STORAGE + 1*16]
+        movdqu  xmm8,  [rsp + LOCAL_STORAGE + 2*16]
+        movdqu  xmm9,  [rsp + LOCAL_STORAGE + 3*16]
+        movdqu  xmm10, [rsp + LOCAL_STORAGE + 4*16]
+        movdqu  xmm11, [rsp + LOCAL_STORAGE + 5*16]
+        movdqu  xmm12, [rsp + LOCAL_STORAGE + 6*16]
+        movdqu  xmm13, [rsp + LOCAL_STORAGE + 7*16]
+        movdqu  xmm14, [rsp + LOCAL_STORAGE + 8*16]
+        movdqu  xmm15, [rsp + LOCAL_STORAGE + 9*16]
+%endif
+
+        ;; Drop the local frame, then pop in the reverse order of FUNC_SAVE's pushes
+        mov     rsp, r14
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV (12 bytes are read),
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: GDATA_CTX with AadHash = hash of A_IN, AadLen, InLen = 0, PBlockLen = 0,
+; PBlockEncKey = 0, OrigIV and CurCount set.  Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro  GCM_INIT 	5
+%define %%GDATA_KEY	%1
+%define %%GDATA_CTX	%2
+%define %%IV		%3
+%define %%A_IN		%4
+%define %%A_LEN		%5
+%define %%AAD_HASH	xmm0
+
+	CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+	pxor	xmm2, xmm2				; xmm2 = 0 (xmm2/xmm3 were scratch for CALC_AAD_HASH, so xor-ing them is not zero)
+	mov	r10, %%A_LEN
+
+	movdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH	; ctx_data.aad hash = aad_hash
+	mov	[%%GDATA_CTX + AadLen], r10		; ctx_data.aad_length = aad_length
+	xor	r10, r10
+	mov	[%%GDATA_CTX + InLen], r10		; ctx_data.in_length = 0
+	mov	[%%GDATA_CTX + PBlockLen], r10		; ctx_data.partial_block_length = 0
+	movdqu	[%%GDATA_CTX + PBlockEncKey], xmm2	; ctx_data.partial_block_enc_key = 0
+	mov	r10, %%IV
+        movdqa  xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
+        pinsrq  xmm2, [r10], 0                          ; IV bytes 0..7
+        pinsrd  xmm2, [r10+8], 2                        ; IV bytes 8..11
+	movdqu	[%%GDATA_CTX + OrigIV], xmm2		; ctx_data.orig_IV = iv
+
+	pshufb xmm2, [SHUF_MASK]
+
+	movdqu	[%%GDATA_CTX + CurCount], xmm2		; ctx_data.current_counter = iv block after SHUF_MASK shuffle
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
+; struct has been initialized by GCM_INIT.
+; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro	GCM_ENC_DEC		6
+%define	%%GDATA_KEY		%1
+%define	%%GDATA_CTX		%2
+%define	%%CYPH_PLAIN_OUT	%3
+%define	%%PLAIN_CYPH_IN		%4
+%define	%%PLAIN_CYPH_LEN	%5
+%define	%%ENC_DEC		%6
+%define	%%DATA_OFFSET		r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+	cmp	%%PLAIN_CYPH_LEN, 0
+	je	%%_multiple_of_16_bytes			; zero-length input: nothing to do
+
+	xor	%%DATA_OFFSET, %%DATA_OFFSET
+%ifidn __OUTPUT_FORMAT__, win64
+       	mov	r12, %%PLAIN_CYPH_LEN
+       	add	[%%GDATA_CTX + InLen], r12 ;Update length of data processed
+%else
+       	add	[%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+%endif
+	movdqu	xmm13, [%%GDATA_KEY + HashKey]                 ; xmm13 = HashKey
+	movdqu	xmm8, [%%GDATA_CTX + AadHash]                  ; xmm8 = hash value stored in the context
+
+
+	PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+        mov     r13, %%PLAIN_CYPH_LEN                               ; save the number of bytes of plaintext/ciphertext
+	sub	r13, %%DATA_OFFSET                      ; minus the bytes already accounted for in DATA_OFFSET
+	mov	r10, r13	;save the amount of data left to process in r10
+        and     r13, -16                                ; r13 = r13 - (r13 mod 16)
+
+        mov     r12, r13
+        shr     r12, 4                                  ; r12 = number of whole 16-byte blocks
+        and     r12, 7                                  ; r12 = (number of blocks) mod 8
+        jz      %%_initial_num_blocks_is_0
+
+        cmp     r12, 7
+        je      %%_initial_num_blocks_is_7
+        cmp     r12, 6
+        je      %%_initial_num_blocks_is_6
+        cmp     r12, 5
+        je      %%_initial_num_blocks_is_5
+        cmp     r12, 4
+        je      %%_initial_num_blocks_is_4
+        cmp     r12, 3
+        je      %%_initial_num_blocks_is_3
+        cmp     r12, 2
+        je      %%_initial_num_blocks_is_2
+
+        jmp     %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*7
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*6
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*5
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*4
+        jmp     %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*3
+        jmp     %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*2
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+	INITIAL_BLOCKS	%%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+        cmp     r13, 0
+        je      %%_zero_cipher_left
+
+        sub     r13, 128
+        je      %%_eight_cipher_left
+
+        ; r15d = lowest byte of xmm9's low dword.  The loop below adds 8 to it per
+        ; iteration (add r15b, 8) and uses it to choose between the out_order and
+        ; in_order variants of GHASH_8_ENCRYPT_8_PARALLEL.
+
+        movd    r15d, xmm9
+        and     r15d, 255
+        pshufb  xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+        cmp     r15d, 255-8
+        jg      %%_encrypt_by_8                         ; tracked byte > 247: take the in_order path
+
+
+
+        add     r15b, 8
+	GHASH_8_ENCRYPT_8_PARALLEL	%%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+        add     %%DATA_OFFSET, 128
+        sub     r13, 128
+        jne     %%_encrypt_by_8_new
+
+        pshufb  xmm9, [SHUF_MASK]
+        jmp     %%_eight_cipher_left
+
+%%_encrypt_by_8:
+        pshufb  xmm9, [SHUF_MASK]
+        add     r15b, 8
+	GHASH_8_ENCRYPT_8_PARALLEL	%%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+        pshufb  xmm9, [SHUF_MASK]
+        add     %%DATA_OFFSET, 128
+        sub     r13, 128
+        jne     %%_encrypt_by_8_new
+
+        pshufb  xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+	GHASH_LAST_8	%%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+	movdqu	[%%GDATA_CTX + AadHash], xmm14		; my_ctx.data.aad_hash = xmm14
+	movdqu	[%%GDATA_CTX + CurCount], xmm9		; my_ctx.data.current_counter = xmm9
+
+        mov     r13, r10
+        and     r13, 15                                ; r13 = (amount of data saved in r10 above) mod 16
+
+        je      %%_multiple_of_16_bytes
+
+	mov	[%%GDATA_CTX + PBlockLen], r13		; my_ctx.data.partial_blck_length = r13
+        ; handle the last <16 Byte block separately
+
+        paddd   xmm9, [ONE]                     ; INCR CNT to get Yn
+	movdqu	[%%GDATA_CTX + CurCount], xmm9		; my_ctx.data.current_counter = xmm9
+        pshufb  xmm9, [SHUF_MASK]
+	ENCRYPT_SINGLE_BLOCK	%%GDATA_KEY, xmm9, xmm2                    ; E(K, Yn)
+	movdqu	[%%GDATA_CTX + PBlockEncKey], xmm9		; my_ctx_data.partial_block_enc_key = xmm9
+
+	cmp	%%PLAIN_CYPH_LEN, 16
+	jge	%%_large_enough_update			; NOTE(review): signed compare on a length - confirm lengths >= 2^63 cannot occur
+
+	lea	r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+	READ_SMALL_DATA_INPUT	xmm1, r10, r13, r12, r15, rax
+	lea	r12, [SHIFT_MASK + 16]
+	sub	r12, r13
+	jmp	%%_data_read
+
+%%_large_enough_update:
+        sub     %%DATA_OFFSET, 16
+        add     %%DATA_OFFSET, r13
+
+        movdqu  xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]                        ; load the 16 bytes that end with the last <16 Byte block
+
+	sub     %%DATA_OFFSET, r13			; undo the temporary adjustment of DATA_OFFSET
+        add     %%DATA_OFFSET, 16
+
+        lea     r12, [SHIFT_MASK + 16]
+        sub     r12, r13                                ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+        movdqu  xmm2, [r12]                             ; get the appropriate shuffle mask
+        pshufb  xmm1, xmm2                              ; shift right 16-r13 bytes
+%%_data_read:
+        %ifidn  %%ENC_DEC, DEC
+        movdqa  xmm2, xmm1                              ; keep a copy of the input block for the hash
+        pxor    xmm9, xmm1                              ; Ciphertext XOR E(K, Yn)
+        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
+        pand    xmm2, xmm1                              ; same mask applied to the saved input block
+        pshufb  xmm2, [SHUF_MASK]
+        pxor    xmm14, xmm2                             ; fold the masked input block into the hash
+	movdqu	[%%GDATA_CTX + AadHash], xmm14
+
+        %else
+        pxor    xmm9, xmm1                              ; Plaintext XOR E(K, Yn)
+        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
+        pshufb  xmm9, [SHUF_MASK]
+        pxor    xmm14, xmm9                             ; fold the masked output block into the hash
+	movdqu	[%%GDATA_CTX + AadHash], xmm14
+
+        pshufb  xmm9, [SHUF_MASK]               ; shuffle xmm9 back to output as ciphertext
+        %endif
+
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ; output r13 Bytes
+        movq    rax, xmm9
+        cmp     r13, 8
+        jle     %%_less_than_8_bytes_left
+
+        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax ; more than 8 bytes: store the low 8 in one go
+        add     %%DATA_OFFSET, 8
+        psrldq  xmm9, 8
+        movq    rax, xmm9
+        sub     r13, 8
+
+%%_less_than_8_bytes_left:
+        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al ; store the remaining bytes one at a time
+        add     %%DATA_OFFSET, 1
+        shr     rax, 8
+        sub     r13, 1
+        jne     %%_less_than_8_bytes_left
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE hashes any pending partial block and the length block, then writes the tag.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX), tag buffer (AUTH_TAG),
+; tag length in bytes (AUTH_TAG_LEN) and whether encoding or decoding (ENC_DEC, not referenced in the body).
+; Output: AUTH_TAG_LEN bytes of the authentication tag stored at AUTH_TAG.
+; Clobbers rax, r10-r12, and xmm0-xmm2, xmm5, xmm6, xmm9-xmm11, xmm13-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro	GCM_COMPLETE		5
+%define	%%GDATA_KEY		%1
+%define	%%GDATA_CTX		%2
+%define	%%AUTH_TAG		%3
+%define	%%AUTH_TAG_LEN		%4
+%define	%%ENC_DEC		%5
+%define	%%PLAIN_CYPH_LEN	rax
+
+        mov     r12, [%%GDATA_CTX + PBlockLen]		; r12 = length of the pending partial block (bytes)
+	movdqu	xmm14, [%%GDATA_CTX + AadHash]
+	movdqu	xmm13, [%%GDATA_KEY + HashKey]
+
+	cmp	r12, 0
+
+	je %%_partial_done				; no partial block pending
+
+	GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+	movdqu	[%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+	mov	r12, [%%GDATA_CTX + AadLen]			; r12 = aadLen (number of bytes)
+	mov	%%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+        shl     r12, 3                                  ; convert into number of bits
+        movq    xmm15, r12                              ; len(A) in xmm15, full 64 bits (movd r12d would drop the upper half)
+
+        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits
+        movq    xmm1, %%PLAIN_CYPH_LEN
+        pslldq  xmm15, 8                                ; xmm15 = len(A)|| 0x0000000000000000
+        pxor    xmm15, xmm1                             ; xmm15 = len(A)||len(C)
+
+        pxor    xmm14, xmm15
+        GHASH_MUL       xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6    ; final GHASH computation
+        pshufb  xmm14, [SHUF_MASK]                      ; perform a 16Byte swap
+
+        movdqu  xmm9, [%%GDATA_CTX + OrigIV]            ; xmm9 = Y0
+
+	ENCRYPT_SINGLE_BLOCK	%%GDATA_KEY, xmm9, xmm2	; E(K, Y0)
+
+        pxor    xmm9, xmm14                             ; xmm9 = E(K, Y0) XOR GHASH = tag
+
+
+
+%%_return_T:
+	mov	r10, %%AUTH_TAG				; r10 = authTag
+	mov	r11, %%AUTH_TAG_LEN			; r11 = auth_tag_len
+
+        cmp     r11, 16
+        je      %%_T_16
+
+        cmp     r11, 12
+        je      %%_T_12
+
+        cmp     r11, 8
+        je      %%_T_8
+
+        simd_store_sse r10, xmm9, r11, r12, rax         ; any other length: generic store of r11 bytes
+        jmp     %%_return_T_done
+%%_T_8:
+        movq    rax, xmm9
+        mov     [r10], rax
+        jmp     %%_return_T_done
+%%_T_12:
+        movq    rax, xmm9
+        mov     [r10], rax
+        psrldq  xmm9, 8
+        movd    eax, xmm9
+        mov     [r10 + 8], eax
+        jmp     %%_return_T_done
+%%_T_16:
+        movdqu  [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+        ;; Clear sensitive data from context structure
+        pxor    xmm0, xmm0
+        movdqu	[%%GDATA_CTX + AadHash], xmm0
+        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void	aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse
+;        (struct gcm_key_data *key_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+
+%ifdef SAFE_PARAM
+        ;; key_data must not be NULL
+        cmp     arg1, 0
+        jz      exit_precomp
+%endif
+
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+
+        mov     r14, rsp                        ; remembered so the frame can be dropped on exit
+
+        ;; Reserve the local frame and round rsp down to a multiple of 64
+
+        sub     rsp, VARIABLE_OFFSET
+        and     rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ;; win64: xmm6 is the only xmm register preserved by this routine
+        movdqu  [rsp + LOCAL_STORAGE + 0*16], xmm6
+%endif
+
+        pxor    xmm6, xmm6                      ; start from an all-zero block
+        ENCRYPT_SINGLE_BLOCK    arg1, xmm6, xmm2        ; xmm6 = HashKey
+
+        pshufb  xmm6, [SHUF_MASK]
+        ;; ---- compute HashKey<<1 mod poly from HashKey ----
+        movdqa  xmm2, xmm6
+        psllq   xmm6, 1                         ; shift each qword left by one bit
+        psrlq   xmm2, 63                        ; xmm2 = bit shifted out of each qword
+        movdqa  xmm1, xmm2
+        psrldq  xmm1, 8                         ; xmm1 = bit shifted out of the high qword
+        pslldq  xmm2, 8                         ; move the low qword's carry up to the high qword
+        por     xmm6, xmm2
+        ;; reduction step
+        pshufd  xmm2, xmm1, 00100100b
+        pcmpeqd xmm2, [TWOONE]
+        pand    xmm2, [POLY]
+        pxor    xmm6, xmm2                      ; xmm6 = HashKey<<1 mod poly
+        ;; ---------------------------------------------------
+        movdqu  [arg1 + HashKey], xmm6          ; key_data.HashKey = HashKey<<1 mod poly
+
+
+        PRECOMPUTE      arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_sse_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+        movdqu  xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+        mov     rsp, r14                        ; drop the local frame
+
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+
+exit_precomp:
+
+        ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void   aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse (
+;        const struct gcm_key_data *key_data,
+;        struct gcm_context_data *context_data,
+;        u8      *iv,
+;        const   u8 *aad,
+;        u64     aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+        push    r12
+        push    r13
+%ifidn __OUTPUT_FORMAT__, win64
+        push    r14
+        push    r15
+        mov     r14, rsp                        ; rsp is reloaded from r14 on exit
+        ;; win64: xmm6 is saved below and restored on exit
+        sub     rsp, 1*16
+        movdqu  [rsp + 0*16], xmm6
+%endif
+
+%ifdef SAFE_PARAM
+        ;; key_data must not be NULL
+        cmp     arg1, 0
+        jz      exit_init
+
+        ;; context_data must not be NULL
+        cmp     arg2, 0
+        jz      exit_init
+
+        ;; IV must not be NULL
+        cmp     arg3, 0
+        jz      exit_init
+
+        ;; aad_len == 0: the aad pointer is not checked
+        cmp     arg5, 0
+        jz      skip_aad_check_init
+
+        ;; aad_len != 0: aad must not be NULL
+        cmp     arg4, 0
+        jz      exit_init
+
+skip_aad_check_init:
+%endif
+        GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_sse_asm
+%endif
+exit_init:
+
+%ifidn __OUTPUT_FORMAT__, win64
+        movdqu  xmm6, [rsp + 0*16]
+        mov     rsp, r14
+        pop     r15
+        pop     r14
+%endif
+        pop     r13
+        pop     r12
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void   aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse
+;        const struct gcm_key_data *key_data,
+;        struct gcm_context_data *context_data,
+;        u8      *out,
+;        const   u8 *in,
+;        u64     plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+
+        FUNC_SAVE
+
+%ifdef SAFE_PARAM
+        ;; key_data must not be NULL
+        cmp     arg1, 0
+        jz      exit_update_enc
+
+        ;; context_data must not be NULL
+        cmp     arg2, 0
+        jz      exit_update_enc
+
+        ;; plaintext_len == 0: in/out pointers are not checked
+        cmp     arg5, 0
+        jz      skip_in_out_check_update_enc
+
+        ;; plaintext_len != 0: out must not be NULL
+        cmp     arg3, 0
+        jz      exit_update_enc
+
+        ;; plaintext_len != 0: in must not be NULL
+        cmp     arg4, 0
+        jz      exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+exit_update_enc:
+        FUNC_RESTORE
+
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void   aes_gcm_dec_128_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse
+;        const struct gcm_key_data *key_data,
+;        struct gcm_context_data *context_data,
+;        u8      *out,
+;        const   u8 *in,
+;        u64     plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+
+        FUNC_SAVE
+
+%ifdef SAFE_PARAM
+        ;; key_data must not be NULL
+        cmp     arg1, 0
+        jz      exit_update_dec
+
+        ;; context_data must not be NULL
+        cmp     arg2, 0
+        jz      exit_update_dec
+
+        ;; plaintext_len == 0: in/out pointers are not checked
+        cmp     arg5, 0
+        jz      skip_in_out_check_update_dec
+
+        ;; plaintext_len != 0: out must not be NULL
+        cmp     arg3, 0
+        jz      exit_update_dec
+
+        ;; plaintext_len != 0: in must not be NULL
+        cmp     arg4, 0
+        jz      exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+exit_update_dec:
+        FUNC_RESTORE
+
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void   aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse
+;        const struct gcm_key_data *key_data,
+;        struct gcm_context_data *context_data,
+;        u8      *auth_tag,
+;        u64     auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+%ifdef SAFE_PARAM
+        ;; Check key_data != NULL
+        cmp     arg1, 0
+        jz      exit_enc_fin
+
+        ;; Check context_data != NULL
+        cmp     arg2, 0
+        jz      exit_enc_fin
+
+        ;; Check auth_tag != NULL
+        cmp     arg3, 0
+        jz      exit_enc_fin
+
+        ;; Check auth_tag_len == 0 or > 16
+        cmp     arg4, 0
+        jz      exit_enc_fin
+
+        cmp     arg4, 16
+        ja      exit_enc_fin
+%endif
+	push r12				; r12 is used as scratch by GCM_COMPLETE
+%ifidn __OUTPUT_FORMAT__, win64
+	; win64: save every callee-saved xmm register GCM_COMPLETE writes (xmm10 and xmm13 included)
+	sub	rsp, 7*16
+	movdqu	[rsp + 0*16],xmm6
+	movdqu	[rsp + 1*16],xmm9
+	movdqu	[rsp + 2*16],xmm10
+	movdqu	[rsp + 3*16],xmm11
+	movdqu	[rsp + 4*16],xmm13
+	movdqu	[rsp + 5*16],xmm14
+	movdqu	[rsp + 6*16],xmm15
+%endif
+	GCM_COMPLETE	arg1, arg2, arg3, arg4, ENC
+
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_sse_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqu	xmm15, [rsp + 6*16]
+	movdqu	xmm14, [rsp + 5*16]
+	movdqu	xmm13, [rsp + 4*16]
+	movdqu	xmm11, [rsp + 3*16]
+	movdqu	xmm10, [rsp + 2*16]
+	movdqu	xmm9 , [rsp + 1*16]
+	movdqu	xmm6 , [rsp + 0*16]
+	add	rsp, 7*16
+%endif
+	pop r12
+exit_enc_fin:
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void   aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse
+;        const struct gcm_key_data *key_data,
+;        struct gcm_context_data *context_data,
+;        u8      *auth_tag,
+;        u64     auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+
+%ifdef SAFE_PARAM
+        ;; Check key_data != NULL
+        cmp     arg1, 0
+        jz      exit_dec_fin
+
+        ;; Check context_data != NULL
+        cmp     arg2, 0
+        jz      exit_dec_fin
+
+        ;; Check auth_tag != NULL
+        cmp     arg3, 0
+        jz      exit_dec_fin
+
+        ;; Check auth_tag_len == 0 or > 16
+        cmp     arg4, 0
+        jz      exit_dec_fin
+
+        cmp     arg4, 16
+        ja      exit_dec_fin
+%endif
+	push r12				; r12 is used as scratch by GCM_COMPLETE
+%ifidn __OUTPUT_FORMAT__, win64
+	; win64: save every callee-saved xmm register GCM_COMPLETE writes (xmm10 and xmm13 included)
+	sub	rsp, 7*16
+	movdqu	[rsp + 0*16],xmm6
+	movdqu	[rsp + 1*16],xmm9
+	movdqu	[rsp + 2*16],xmm10
+	movdqu	[rsp + 3*16],xmm11
+	movdqu	[rsp + 4*16],xmm13
+	movdqu	[rsp + 5*16],xmm14
+	movdqu	[rsp + 6*16],xmm15
+%endif
+	GCM_COMPLETE	arg1, arg2, arg3, arg4, DEC
+
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_sse_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqu	xmm15, [rsp + 6*16]
+	movdqu	xmm14, [rsp + 5*16]
+	movdqu	xmm13, [rsp + 4*16]
+	movdqu	xmm11, [rsp + 3*16]
+	movdqu	xmm10, [rsp + 2*16]
+	movdqu	xmm9 , [rsp + 1*16]
+	movdqu	xmm6 , [rsp + 0*16]
+	add	rsp, 7*16
+%endif
+	pop r12
+exit_dec_fin:
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void   aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse
+;        const struct gcm_key_data *key_data,
+;        struct gcm_context_data *context_data,
+;        u8      *out,
+;        const   u8 *in,
+;        u64     plaintext_len,
+;        u8      *iv,
+;        const   u8 *aad,
+;        u64     aad_len,
+;        u8      *auth_tag,
+;        u64     auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+
+        FUNC_SAVE
+
+%ifdef SAFE_PARAM
+        ;; key_data must not be NULL
+        cmp     arg1, 0
+        jz      exit_enc
+
+        ;; context_data must not be NULL
+        cmp     arg2, 0
+        jz      exit_enc
+
+        ;; IV must not be NULL
+        cmp     arg6, 0
+        jz      exit_enc
+
+        ;; auth_tag must not be NULL
+        cmp     arg9, 0
+        jz      exit_enc
+
+        ;; auth_tag_len must be in 1..16
+        cmp     arg10, 0
+        jz      exit_enc
+
+        cmp     arg10, 16
+        ja      exit_enc
+
+        ;; plaintext_len == 0: in/out pointers are not checked
+        cmp     arg5, 0
+        jz      skip_in_out_check_enc
+
+        ;; plaintext_len != 0: out must not be NULL
+        cmp     arg3, 0
+        jz      exit_enc
+
+        ;; plaintext_len != 0: in must not be NULL
+        cmp     arg4, 0
+        jz      exit_enc
+
+skip_in_out_check_enc:
+        ;; aad_len == 0: the aad pointer is not checked
+        cmp     arg8, 0
+        jz      skip_aad_check_enc
+
+        ;; aad_len != 0: aad must not be NULL
+        cmp     arg7, 0
+        jz      exit_enc
+
+skip_aad_check_enc:
+%endif
+        GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, ENC
+
+        GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+exit_enc:
+        FUNC_RESTORE
+
+        ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void   aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse
+;        const struct gcm_key_data *key_data,
+;        struct gcm_context_data *context_data,
+;        u8      *out,
+;        const   u8 *in,
+;        u64     plaintext_len,
+;        u8      *iv,
+;        const   u8 *aad,
+;        u64     aad_len,
+;        u8      *auth_tag,
+;        u64     auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+
+        FUNC_SAVE
+
+%ifdef SAFE_PARAM
+        ;; key_data must not be NULL
+        cmp     arg1, 0
+        jz      exit_dec
+
+        ;; context_data must not be NULL
+        cmp     arg2, 0
+        jz      exit_dec
+
+        ;; IV must not be NULL
+        cmp     arg6, 0
+        jz      exit_dec
+
+        ;; auth_tag must not be NULL
+        cmp     arg9, 0
+        jz      exit_dec
+
+        ;; auth_tag_len must be in 1..16
+        cmp     arg10, 0
+        jz      exit_dec
+
+        cmp     arg10, 16
+        ja      exit_dec
+
+        ;; plaintext_len == 0: in/out pointers are not checked
+        cmp     arg5, 0
+        jz      skip_in_out_check_dec
+
+        ;; plaintext_len != 0: out must not be NULL
+        cmp     arg3, 0
+        jz      exit_dec
+
+        ;; plaintext_len != 0: in must not be NULL
+        cmp     arg4, 0
+        jz      exit_dec
+
+skip_in_out_check_dec:
+        ;; aad_len == 0: the aad pointer is not checked
+        cmp     arg8, 0
+        jz      skip_aad_check_dec
+
+        ;; aad_len != 0: aad must not be NULL
+        cmp     arg7, 0
+        jz      exit_dec
+
+skip_aad_check_dec:
+%endif
+
+        GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, DEC
+
+        GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+exit_dec:
+        FUNC_RESTORE
+
+        ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/kasumi_sse.c b/src/spdk/intel-ipsec-mb/sse/kasumi_sse.c
new file mode 100644
index 000000000..b1ef71a8a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/kasumi_sse.c
@@ -0,0 +1,385 @@
+/*******************************************************************************
+  Copyright (c) 2009-2019, Intel Corporation
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+      * Neither the name of Intel Corporation nor the names of its contributors
+        may be used to endorse or promote products derived from this software
+        without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <limits.h>
+
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse
+
+#include "include/kasumi_internal.h"
+#include "include/save_xmms.h"
+#include "include/clear_regs_mem.h"
+
+#define SAVE_XMMS               save_xmms
+#define RESTORE_XMMS            restore_xmms
+/* SSE build of single-buffer KASUMI F8: forwards to kasumi_f8_1_buffer(). */
+void
+kasumi_f8_1_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+                       const void *pBufferIn, void *pBufferOut,
+                       const uint32_t cipherLengthInBytes)
+{
+#ifndef LINUX /* non-Linux builds bracket the call with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if a required pointer is NULL */
+        if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL)
+                return;
+
+        /* Accept 1 .. KASUMI_MAX_LEN / CHAR_BIT bytes; otherwise do nothing */
+        if (cipherLengthInBytes == 0 ||
+            cipherLengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif /* SAFE_PARAM */
+        kasumi_f8_1_buffer(pCtx, IV, pBufferIn, pBufferOut,
+                           cipherLengthInBytes);
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the call */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+/* SSE build of bit-length KASUMI F8: forwards to kasumi_f8_1_buffer_bit(). */
+void
+kasumi_f8_1_buffer_bit_sse(const kasumi_key_sched_t *pCtx,
+                           const uint64_t IV,
+                           const void *pBufferIn, void *pBufferOut,
+                           const uint32_t cipherLengthInBits,
+                           const uint32_t offsetInBits)
+{
+#ifndef LINUX /* non-Linux builds bracket the call with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if a required pointer is NULL */
+        if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL)
+                return;
+
+        /* Accept 1 .. KASUMI_MAX_LEN bits; offsetInBits is not validated */
+        if (cipherLengthInBits == 0 ||
+            cipherLengthInBits > KASUMI_MAX_LEN)
+                return;
+#endif /* SAFE_PARAM */
+        kasumi_f8_1_buffer_bit(pCtx, IV, pBufferIn, pBufferOut,
+                               cipherLengthInBits, offsetInBits);
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the call */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+/* SSE build of two-buffer KASUMI F8: forwards to kasumi_f8_2_buffer(). */
+void
+kasumi_f8_2_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+                       const uint64_t IV2, const void *pBufferIn1,
+                       void *pBufferOut1, const uint32_t lengthInBytes1,
+                       const void *pBufferIn2, void *pBufferOut2,
+                       const uint32_t lengthInBytes2)
+{
+#ifndef LINUX /* non-Linux builds bracket the call with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if a required pointer is NULL */
+        if (pCtx == NULL)
+                return;
+
+        if (pBufferIn1 == NULL || pBufferOut1 == NULL)
+                return;
+
+        if (pBufferIn2 == NULL || pBufferOut2 == NULL)
+                return;
+
+        /* Each length must be 1 .. KASUMI_MAX_LEN / CHAR_BIT bytes */
+        if (lengthInBytes1 == 0 || lengthInBytes1 > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+
+        if (lengthInBytes2 == 0 || lengthInBytes2 > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif /* SAFE_PARAM */
+        kasumi_f8_2_buffer(pCtx, IV1, IV2,
+                           pBufferIn1, pBufferOut1, lengthInBytes1,
+                           pBufferIn2, pBufferOut2, lengthInBytes2);
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the call */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+/* SSE build of three-buffer KASUMI F8: forwards to kasumi_f8_3_buffer(). */
+void
+kasumi_f8_3_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+                       const uint64_t IV2, const uint64_t IV3,
+                       const void *pBufferIn1, void *pBufferOut1,
+                       const void *pBufferIn2, void *pBufferOut2,
+                       const void *pBufferIn3, void *pBufferOut3,
+                       const uint32_t lengthInBytes)
+{
+#ifndef LINUX /* non-Linux builds bracket the call with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if a required pointer is NULL */
+        if (pCtx == NULL)
+                return;
+
+        if (pBufferIn1 == NULL || pBufferOut1 == NULL)
+                return;
+
+        if (pBufferIn2 == NULL || pBufferOut2 == NULL)
+                return;
+
+        if (pBufferIn3 == NULL || pBufferOut3 == NULL)
+                return;
+
+        /* The one shared length must be 1 .. KASUMI_MAX_LEN / CHAR_BIT bytes */
+        if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif /* SAFE_PARAM */
+        kasumi_f8_3_buffer(pCtx, IV1, IV2, IV3,
+                           pBufferIn1, pBufferOut1,
+                           pBufferIn2, pBufferOut2,
+                           pBufferIn3, pBufferOut3, lengthInBytes);
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the call */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+/* SSE build of four-buffer KASUMI F8: forwards to kasumi_f8_4_buffer(). */
+void
+kasumi_f8_4_buffer_sse(const kasumi_key_sched_t *pCtx,
+                       const uint64_t IV1, const uint64_t IV2,
+                       const uint64_t IV3, const uint64_t IV4,
+                       const void *pBufferIn1, void *pBufferOut1,
+                       const void *pBufferIn2, void *pBufferOut2,
+                       const void *pBufferIn3, void *pBufferOut3,
+                       const void *pBufferIn4, void *pBufferOut4,
+                       const uint32_t lengthInBytes)
+{
+#ifndef LINUX /* non-Linux builds bracket the call with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if a required pointer is NULL */
+        if (pCtx == NULL)
+                return;
+
+        if (pBufferIn1 == NULL || pBufferOut1 == NULL)
+                return;
+
+        if (pBufferIn2 == NULL || pBufferOut2 == NULL)
+                return;
+
+        if (pBufferIn3 == NULL || pBufferOut3 == NULL)
+                return;
+
+        if (pBufferIn4 == NULL || pBufferOut4 == NULL)
+                return;
+
+        /* The one shared length must be 1 .. KASUMI_MAX_LEN / CHAR_BIT bytes */
+        if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif /* SAFE_PARAM */
+        kasumi_f8_4_buffer(pCtx, IV1, IV2, IV3, IV4,
+                           pBufferIn1, pBufferOut1,
+                           pBufferIn2, pBufferOut2,
+                           pBufferIn3, pBufferOut3,
+                           pBufferIn4, pBufferOut4,
+                           lengthInBytes);
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the call */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+/* SSE build of n-buffer KASUMI F8: calls kasumi_f8_n_buffer() per chunk. */
+void
+kasumi_f8_n_buffer_sse(const kasumi_key_sched_t *pKeySchedule,
+                       const uint64_t IV[],
+                       const void * const pDataIn[], void *pDataOut[],
+                       const uint32_t dataLen[], const uint32_t dataCount)
+{
+#ifndef LINUX /* non-Linux builds bracket the work with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+        uint32_t numLeft = dataCount; /* buffers not yet handed off */
+        const uint64_t *IVPtr;
+        const void * const *pDataInPtr;
+        void **pDataOutPtr;
+        const uint32_t *dataLenPtr;
+        uint32_t i = 0; /* index of the first buffer of the current chunk */
+        uint32_t numBuffs; /* buffers in the current chunk (at most 16) */
+
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if any array pointer is NULL */
+        if (pKeySchedule == NULL || pDataIn == NULL || pDataOut == NULL ||
+            dataLen == NULL || IV == NULL)
+                return;
+
+        for (i = 0; i < dataCount; i++) {
+                /* Every per-buffer pointer must be non-NULL */
+                if (pDataIn[i] == NULL || pDataOut[i] == NULL)
+                        return;
+
+                /* Every length must be 1 .. KASUMI_MAX_LEN / CHAR_BIT bytes */
+                if (dataLen[i] == 0 || dataLen[i] > (KASUMI_MAX_LEN / CHAR_BIT))
+                        return;
+        }
+#endif /* SAFE_PARAM */
+
+        i = 0; /* the validation loop above may have advanced i */
+
+        /* KASUMI F8 n buffer function can handle up to 16 buffers */
+        while (numLeft > 0) {
+                IVPtr = &IV[i];
+                pDataInPtr = &pDataIn[i];
+                pDataOutPtr = &pDataOut[i];
+                dataLenPtr = &dataLen[i];
+                numBuffs = (numLeft > 16) ? 16 : numLeft;
+
+                kasumi_f8_n_buffer(pKeySchedule, IVPtr, pDataInPtr, pDataOutPtr,
+                                   dataLenPtr, numBuffs);
+                i += numBuffs;
+                numLeft -= numBuffs;
+        }
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the loop */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+
+/* SSE build of single-buffer KASUMI F9: forwards to kasumi_f9_1_buffer(). */
+void
+kasumi_f9_1_buffer_sse(const kasumi_key_sched_t *pCtx, const void *pBufferIn,
+                       const uint32_t lengthInBytes, void *pDigest)
+{
+#ifndef LINUX /* non-Linux builds bracket the call with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if a required pointer is NULL */
+        if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL)
+                return;
+
+        /* Accept 1 .. KASUMI_MAX_LEN / CHAR_BIT bytes; otherwise do nothing */
+        if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif /* SAFE_PARAM */
+        kasumi_f9_1_buffer(pCtx, pBufferIn, lengthInBytes, pDigest);
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the call */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+/* SSE build of KASUMI F9 (user variant): wraps kasumi_f9_1_buffer_user(). */
+void
+kasumi_f9_1_buffer_user_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+                            const void *pBufferIn, const uint32_t lengthInBits,
+                            void *pDigest, const uint32_t direction)
+{
+#ifndef LINUX /* non-Linux builds bracket the call with XMM save/restore */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif /* !LINUX */
+#ifdef SAFE_PARAM /* NOTE(review): early returns below skip RESTORE_XMMS */
+        /* Return without doing any work if a required pointer is NULL */
+        if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL)
+                return;
+
+        /* Accept 1 .. KASUMI_MAX_LEN bits; IV and direction are not validated */
+        if (lengthInBits == 0 || lengthInBits > KASUMI_MAX_LEN)
+                return;
+#endif /* SAFE_PARAM */
+        kasumi_f9_1_buffer_user(pCtx, IV, pBufferIn, lengthInBits,
+                                pDigest, direction);
+#ifdef SAFE_DATA
+        /* Scrub scratch general-purpose and SIMD registers after the call */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif /* !LINUX */
+}
+/* Pass-through to kasumi_init_f8_key_sched(); no checks are added here. */
+int
+kasumi_init_f8_key_sched_sse(const void *const pKey,
+                             kasumi_key_sched_t *pCtx)
+{
+        return kasumi_init_f8_key_sched(pKey, pCtx); /* result returned as is */
+}
+/* Pass-through to kasumi_init_f9_key_sched(); no checks are added here. */
+int
+kasumi_init_f9_key_sched_sse(const void *const pKey,
+                             kasumi_key_sched_t *pCtx)
+{
+        return kasumi_init_f9_key_sched(pKey, pCtx); /* result returned as is */
+}
+/* Pass-through to kasumi_key_sched_size(). */
+size_t
+kasumi_key_sched_size_sse(void)
+{
+        return kasumi_key_sched_size(); /* result returned as is */
+}
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm
new file mode 100644
index 000000000..305c80342
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 ; AES-192 routine name handed to the shared file below
+%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_sse ; presumably the entry point the include emits - confirm there
+%include "sse/mb_mgr_aes_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm
new file mode 100644
index 000000000..c9129e758
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 ; AES-192 routine name handed to the shared file below
+%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_sse ; presumably the entry point the include emits - confirm there
+%include "sse/mb_mgr_aes_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm
new file mode 100644
index 000000000..2c8afece9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 ; AES-256 routine name handed to the shared file below
+%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_sse ; presumably the entry point the include emits - confirm there
+%include "sse/mb_mgr_aes_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm
new file mode 100644
index 000000000..55f7767f4
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 ; AES-256 routine name handed to the shared file below
+%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_sse ; presumably the entry point the include emits - confirm there
+%include "sse/mb_mgr_aes_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm
new file mode 100644
index 000000000..7aca39f25
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm
@@ -0,0 +1,518 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+%include "include/memcpy.asm"
+
+%ifndef AES128_CBC_MAC
+;; Default names, used only when AES128_CBC_MAC was not defined beforehand
+%define AES128_CBC_MAC aes128_cbc_mac_x4
+%define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_sse
+%define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_sse
+
+%endif
+
+extern AES128_CBC_MAC ; CBC-MAC routine called from the submit/flush macro below
+
+section .data
+default rel
+
+align 16
+len_masks: ; entry I sets 16-bit word I to 0xFFFF; OR-ed into ccm_lens for lanes without a job (flush path)
+        dq 0x000000000000FFFF, 0x0000000000000000 ; lane 0
+        dq 0x00000000FFFF0000, 0x0000000000000000 ; lane 1
+        dq 0x0000FFFF00000000, 0x0000000000000000 ; lane 2
+        dq 0xFFFF000000000000, 0x0000000000000000 ; lane 3
+counter_mask: ; AND-ed with block 0: keeps low 3 bits of byte 0 and bytes 1-13, clears bytes 14-15
+	dq 0xFFFFFFFFFFFFFF07, 0x0000FFFFFFFFFFFF
+one:    dq  1 ; one/two/three: qword memory operands for the cmovne lane selection in the flush path
+two:    dq  2
+three:  dq  3
+
+section .text
+
+%define APPEND(a,b) a %+ b ; token-paste a and b (builds the skip_0..skip_3 labels)
+
+%define NROUNDS 9 ; AES-CCM-128: 9 aesenc rounds, then one aesenclast
+%ifdef LINUX ; first two integer arguments: rdi/rsi when LINUX is defined, else rcx/rdx
+%define arg1    rdi
+%define arg2    rsi
+%else
+%define arg1    rcx
+%define arg2    rdx
+%endif
+;; Symbolic register names. Names sharing a register must not be live together
+%define state   arg1 ; base pointer for every _aes_ccm_* field access
+%define job     arg2 ; job being submitted
+%define len2    arg2 ; arg2 reused as the length passed to AES128_CBC_MAC
+
+%define job_rax          rax
+%define tmp4             rax
+%define auth_len_aad     rax ; authenticated length when AAD is present
+
+%define min_idx          rbp ; lane index reported by phminposuw
+%define flags            rbp ; CCM block 0 flags byte while it is being built
+
+%define lane             r8 ; lane taken from unused_lanes on submit
+
+%define iv_len           r9
+%define auth_len         r9 ; shares r9 with iv_len
+
+%define aad_len          r10
+%define init_block_addr  r11 ; address of the lane's slot in _aes_ccm_init_blocks
+
+%define unused_lanes     rbx
+%define r                rbx
+
+%define tmp              r12
+%define tmp2             r13
+%define tmp3             r14
+
+%define good_lane        r15 ; flush: a lane holding a job, copied into empty lanes
+%define min_job          r15 ; job stored in lane min_idx
+
+%define init_block0      xmm0 ; CCM block 0, later reused as counter block 0
+%define ccm_lens         xmm1 ; working copy of the _aes_ccm_lens words
+%define min_len_idx      xmm2 ; phminposuw result: word 0 = min length, word 1 = lane index
+%define xtmp0            xmm3
+%define xtmp1            xmm4
+%define xtmp2            xmm5
+%define xtmp3            xmm6
+
+; The stack frame below is 9 qwords (72 bytes), i.e. an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save:      resq    8 ; rbx, rbp, r12-r15, plus rsi and rdi on non-Linux builds
+_rsp_save:      resq    1 ; rsp as it was on entry, before the frame was carved and aligned
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+%macro  ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1 ; [in] register pointing at the round keys, 16 bytes apart
+%define %%XMM0  %2 ; [in/out] XMM register holding the block, encrypted in place
+;; Emits pxor with key 0, NROUNDS aesenc steps, then aesenclast with the next key
+                pxor           %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+                aesenc         %%XMM0, [%%GDATA+16*i] ; keys 1 .. NROUNDS
+%assign i (i+1)
+%endrep
+                aesenclast     %%XMM0, [%%GDATA+16*i] ; i == NROUNDS + 1 here
+%endmacro
+
+;;; ===========================================================================
+;;; AES CCM auth job submit & flush
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE 1
+%define %%SUBMIT_FLUSH %1
+
+        mov     rax, rsp
+        sub     rsp, STACK_size
+        and     rsp, -16
+
+        mov     [rsp + _gpr_save + 8*0], rbx
+        mov     [rsp + _gpr_save + 8*1], rbp
+        mov     [rsp + _gpr_save + 8*2], r12
+        mov     [rsp + _gpr_save + 8*3], r13
+        mov     [rsp + _gpr_save + 8*4], r14
+        mov     [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov     [rsp + _gpr_save + 8*6], rsi
+        mov     [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov     [rsp + _rsp_save], rax  ; original SP
+
+        ;; Find free lane
+        mov     unused_lanes, [state + _aes_ccm_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+        mov     lane, unused_lanes
+        and     lane, 15
+        shr     unused_lanes, 4
+        mov     [state + _aes_ccm_unused_lanes], unused_lanes
+
+        ;; Copy job info into lane
+        mov     [state + _aes_ccm_job_in_lane + lane*8], job
+        ;; Copy keys into lane args
+        mov     tmp, [job + _aes_enc_key_expanded]
+        mov     [state + _aes_ccm_args_keys + lane*8], tmp
+        ;; init_done = 0
+        mov     word [state + _aes_ccm_init_done + lane*2], 0
+        lea     tmp, [lane * 8]
+
+        pxor    init_block0, init_block0
+        movdqa  [state + _aes_ccm_args_IV + tmp*2], init_block0
+
+        ;; Prepare initial Block 0 for CBC-MAC-128
+
+        ;; Byte 0: flags with L' and M' (AAD later)
+        ;; Calculate L' = 15 - IV length - 1 = 14 - IV length
+        mov     flags, 14
+        mov     iv_len, [job + _iv_len_in_bytes]
+        sub     flags, iv_len
+        ;; Calculate M' = (Digest length - 2) / 2
+        mov     tmp, [job + _auth_tag_output_len_in_bytes]
+        sub     tmp, 2
+
+        shl     tmp, 2 ; M' << 3 (combine 1xshr, to div by 2, and 3xshl)
+        or      flags, tmp
+
+        ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+        ;; Bytes 1 - 7 are always copied (first 7 bytes)
+        mov     tmp, [job + _iv]
+        pinsrb  init_block0, [tmp], 1
+        pinsrw  init_block0, [tmp + 1], 1
+        pinsrd  init_block0, [tmp + 3], 1
+
+        cmp     iv_len, 7
+        je      %%_finish_nonce_move
+
+        cmp     iv_len, 8
+        je      %%_iv_length_8
+        cmp     iv_len, 9
+        je      %%_iv_length_9
+        cmp     iv_len, 10
+        je      %%_iv_length_10
+        cmp     iv_len, 11
+        je      %%_iv_length_11
+        cmp     iv_len, 12
+        je      %%_iv_length_12
+
+        ;; Bytes 8 - 13
+%%_iv_length_13:
+        pinsrb init_block0, [tmp + 12], 13
+%%_iv_length_12:
+        pinsrb init_block0, [tmp + 11], 12
+%%_iv_length_11:
+        pinsrd init_block0, [tmp + 7], 2
+        jmp     %%_finish_nonce_move
+%%_iv_length_10:
+        pinsrb init_block0, [tmp + 9], 10
+%%_iv_length_9:
+        pinsrb init_block0, [tmp + 8], 9
+%%_iv_length_8:
+        pinsrb init_block0, [tmp + 7], 8
+
+%%_finish_nonce_move:
+
+        ;; Bytes 14 & 15 (message length), in Big Endian
+        mov     ax, [job + _msg_len_to_hash_in_bytes]
+        xchg    al, ah
+        pinsrw  init_block0, ax, 7
+
+        mov     aad_len, [job + _cbcmac_aad_len]
+        ;; Initial length to authenticate (Block 0)
+        mov     auth_len, 16
+        ;; Length to authenticate (Block 0 + len(AAD) (2B) + AAD padded,
+        ;; so length is a multiple of 16B)
+        lea     auth_len_aad, [aad_len + (2 + 15) + 16]
+        and     auth_len_aad, -16
+
+        or      aad_len, aad_len
+        cmovne  auth_len, auth_len_aad
+        ;; Update lengths to authenticate and find min length
+        movdqa  ccm_lens, [state + _aes_ccm_lens]
+        XPINSRW ccm_lens, xtmp0, tmp2, lane, auth_len, scale_x16
+        movdqa  [state + _aes_ccm_lens], ccm_lens
+        phminposuw min_len_idx, ccm_lens
+
+        mov     tmp, lane
+        shl     tmp, 6
+        lea     init_block_addr, [state + _aes_ccm_init_blocks + tmp]
+        or      aad_len, aad_len
+        je      %%_aad_complete
+
+        or      flags, (1 << 6) ; Set Adata bit in flags
+
+        ;; Copy AAD
+        ;; Set all 0s in last block (padding)
+        lea     tmp, [init_block_addr + auth_len]
+        sub     tmp, 16
+        pxor    xtmp0, xtmp0
+        movdqa  [tmp], xtmp0
+
+        ;; Start copying from second block
+        lea     tmp, [init_block_addr+16]
+        mov     rax, aad_len
+        xchg    al, ah
+        mov     [tmp], ax
+        add     tmp, 2
+        mov     tmp2, [job + _cbcmac_aad]
+        memcpy_sse_64_1 tmp, tmp2, aad_len, tmp3, tmp4, xtmp0, xtmp1, xtmp2, xtmp3
+
+%%_aad_complete:
+
+        ;; Finish Block 0 with Byte 0
+        pinsrb  init_block0, BYTE(flags), 0
+        movdqa  [init_block_addr], init_block0
+
+        ;; args.in[lane] = &initial_block
+        mov     [state + _aes_ccm_args_in + lane * 8], init_block_addr
+
+        cmp     byte [state + _aes_ccm_unused_lanes], 0xf
+        jne     %%_return_null
+
+%else ; end SUBMIT
+
+        ;; Check at least one job
+        bt      unused_lanes, 19
+        jc      %%_return_null
+
+        ;; Find a lane with a non-null job
+        xor     good_lane, good_lane
+        cmp     qword [state + _aes_ccm_job_in_lane + 1*8], 0
+        cmovne  good_lane, [rel one]
+        cmp     qword [state + _aes_ccm_job_in_lane + 2*8], 0
+        cmovne  good_lane, [rel two]
+        cmp     qword [state + _aes_ccm_job_in_lane + 3*8], 0
+        cmovne  good_lane, [rel three]
+
+        ; Copy good_lane to empty lanes
+        movzx   tmp,  word [state + _aes_ccm_init_done + good_lane*2]
+        mov     tmp2, [state + _aes_ccm_args_in + good_lane*8]
+        mov     tmp3, [state + _aes_ccm_args_keys + good_lane*8]
+        shl     good_lane, 4 ; multiply by 16
+        movdqa  xtmp0, [state + _aes_ccm_args_IV + good_lane]
+        movdqa  ccm_lens, [state + _aes_ccm_lens]
+
+%assign I 0
+%rep 4
+        cmp     qword [state + _aes_ccm_job_in_lane + I*8], 0
+        jne     APPEND(skip_,I)
+        por     ccm_lens, [rel len_masks + 16*I]
+        mov     [state + _aes_ccm_init_done + I*2], WORD(tmp)
+        mov     [state + _aes_ccm_args_in + I*8], tmp2
+        mov     [state + _aes_ccm_args_keys + I*8], tmp3
+        movdqa  [state + _aes_ccm_args_IV + I*16], xtmp0
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+        movdqa  [state + _aes_ccm_lens], ccm_lens
+        ;; Find min length
+        phminposuw min_len_idx, ccm_lens
+
+%endif ; end FLUSH
+
+%%_ccm_round:
+        pextrw  len2, min_len_idx, 0    ; min value
+        pextrw  min_idx, min_len_idx, 1         ; min index (0...3)
+
+        mov     min_job, [state + _aes_ccm_job_in_lane + min_idx*8]
+
+        or      len2, len2
+        je      %%_len_is_0
+        ;; subtract min length from all lengths
+        pshuflw min_len_idx, min_len_idx, 0 ; broadcast min length
+        psubw   ccm_lens, min_len_idx
+        movdqa  [state + _aes_ccm_lens], ccm_lens
+
+        ; "state" and "args" are the same address, arg1
+        ; len2 is arg2
+        call    AES128_CBC_MAC
+        ; state and min_idx are intact
+
+%%_len_is_0:
+
+        movzx   tmp, WORD [state + _aes_ccm_init_done + min_idx*2]
+        cmp     WORD(tmp), 0
+        je      %%_prepare_full_blocks_to_auth
+        cmp     WORD(tmp), 1
+        je      %%_prepare_partial_block_to_auth
+
+%%_encrypt_digest:
+
+        ;; Set counter block 0 (reusing previous initial block 0)
+        mov     tmp, min_idx
+        shl     tmp, 3
+        movdqa  init_block0, [state + _aes_ccm_init_blocks + tmp * 8]
+
+        pand   init_block0, [rel counter_mask]
+
+        mov     tmp2, [state + _aes_ccm_args_keys + tmp]
+        ENCRYPT_SINGLE_BLOCK tmp2, init_block0
+        pxor    init_block0, [state + _aes_ccm_args_IV + tmp * 2]
+
+        ;; Copy Mlen bytes into auth_tag_output (Mlen = 4,6,8,10,12,14,16)
+        mov     min_job, [state + _aes_ccm_job_in_lane + tmp]
+        mov     tmp3, [min_job + _auth_tag_output_len_in_bytes]
+        mov     tmp2, [min_job + _auth_tag_output]
+
+        simd_store_sse tmp2, init_block0, tmp3, tmp, rax
+
+%%_update_lanes:
+        ; Update unused lanes
+        mov     unused_lanes, [state + _aes_ccm_unused_lanes]
+        shl     unused_lanes, 4
+        or      unused_lanes, min_idx
+        mov     [state + _aes_ccm_unused_lanes], unused_lanes
+
+        ; Set return job
+        mov     job_rax, min_job
+
+        mov     qword [state + _aes_ccm_job_in_lane + min_idx*8], 0
+        or      dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+        pxor    xtmp0, xtmp0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+        shl     min_idx, 3
+        ;; Clear digest (in memory for CBC IV), counter block 0 and AAD of returned job
+        movdqa  [state + _aes_ccm_args_IV + min_idx * 2],          xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + min_idx * 8],      xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + min_idx * 8 + 16], xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + min_idx * 8 + 32], xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + min_idx * 8 + 48], xtmp0
+        mov     qword [state + _aes_ccm_args_keys + min_idx], 0
+%else
+        ;; Clear digest (in memory for CBC IV), counter block 0 and AAD
+        ;; of returned job and "NULL lanes"
+%assign I 0
+%rep 4
+        cmp     qword [state + _aes_ccm_job_in_lane + I*8], 0
+        jne     APPEND(skip_clear_,I)
+        movdqa  [state + _aes_ccm_args_IV + I*16],          xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + I*64],      xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + I*64 + 16], xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + I*64 + 32], xtmp0
+        movdqa  [state + _aes_ccm_init_blocks + I*64 + 48], xtmp0
+        mov     qword [state + _aes_ccm_args_keys + I*8], 0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SUBMIT
+%endif ;; SAFE_DATA
+
+%%_return:
+        mov     rbx, [rsp + _gpr_save + 8*0]
+        mov     rbp, [rsp + _gpr_save + 8*1]
+        mov     r12, [rsp + _gpr_save + 8*2]
+        mov     r13, [rsp + _gpr_save + 8*3]
+        mov     r14, [rsp + _gpr_save + 8*4]
+        mov     r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov     rsi, [rsp + _gpr_save + 8*6]
+        mov     rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov     rsp, [rsp + _rsp_save]  ; original SP
+        ret
+
+%%_return_null:
+        xor     job_rax, job_rax
+        jmp     %%_return
+
+%%_prepare_full_blocks_to_auth:
+
+        cmp     dword [min_job + _cipher_direction], 2 ; DECRYPT
+        je      %%_decrypt
+
+%%_encrypt:
+        mov     tmp, [min_job + _src]
+        add     tmp, [min_job + _hash_start_src_offset_in_bytes]
+        jmp     %%_set_init_done_1
+
+%%_decrypt:
+        mov     tmp, [min_job + _dst]
+
+%%_set_init_done_1:
+        mov     [state + _aes_ccm_args_in + min_idx*8], tmp
+        mov     word [state + _aes_ccm_init_done + min_idx*2], 1
+
+        ; Check if there are full blocks to hash
+        mov     tmp, [min_job + _msg_len_to_hash_in_bytes]
+        and     tmp, -16
+        je      %%_prepare_partial_block_to_auth
+
+        ;; Update lengths to authenticate and find min length
+        movdqa  ccm_lens, [state + _aes_ccm_lens]
+        XPINSRW ccm_lens, xtmp0, tmp2, min_idx, tmp, scale_x16
+        phminposuw min_len_idx, ccm_lens
+        movdqa  [state + _aes_ccm_lens], ccm_lens
+
+        jmp     %%_ccm_round
+
+%%_prepare_partial_block_to_auth:
+        ; Check if partial block needs to be hashed
+        mov     auth_len, [min_job + _msg_len_to_hash_in_bytes]
+        and     auth_len, 15
+        je      %%_encrypt_digest
+
+        mov     word [state + _aes_ccm_init_done + min_idx * 2], 2
+        ;; Update lengths to authenticate and find min length
+        movdqa  ccm_lens, [state + _aes_ccm_lens]
+        XPINSRW ccm_lens, xtmp0, tmp2, min_idx, 16, scale_x16
+        phminposuw min_len_idx, ccm_lens
+        movdqa  [state + _aes_ccm_lens], ccm_lens
+
+        mov     tmp2, min_idx
+        shl     tmp2, 6
+        add     tmp2, 16 ; pb[AES_BLOCK_SIZE]
+        lea     init_block_addr, [state + _aes_ccm_init_blocks + tmp2]
+        mov     tmp2, [state + _aes_ccm_args_in + min_idx * 8]
+
+        simd_load_sse_15_1 xtmp0, tmp2, auth_len
+
+%%_finish_partial_block_copy:
+        movdqa  [init_block_addr], xtmp0
+        mov     [state + _aes_ccm_args_in + min_idx * 8], init_block_addr
+
+        jmp     %%_ccm_round
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (manager holding the per-lane jobs, lengths, IVs and init blocks)
+; arg 2 : job   (job to queue); the result is a completed job pointer or NULL
+MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal)
+SUBMIT_JOB_AES_CCM_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state)
+; arg 1 : state (lanes without a job are filled from a lane that has one; result is a completed job or NULL)
+MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal)
+FLUSH_JOB_AES_CCM_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm
new file mode 100644
index 000000000..01c6315bd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm
@@ -0,0 +1,502 @@
+;;
+;; Copyright (c) 2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%ifndef AES128_CBC_MAC
+
+%define AES128_CBC_MAC aes128_cbc_mac_x4
+%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse
+%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse
+
+%endif
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_masks:
+	; lane 0 mask (ddq 0x0000000000000000000000000000FFFF) - OR-ed into lens for a lane with no job
+	dq 0x000000000000FFFF, 0x0000000000000000
+	; lane 1 mask (ddq 0x000000000000000000000000FFFF0000)
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	; lane 2 mask (ddq 0x00000000000000000000FFFF00000000)
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	; lane 3 mask (ddq 0x0000000000000000FFFF000000000000)
+	dq 0xFFFF000000000000, 0x0000000000000000
+one:	dq  1	; lane-index constants, used as cmovne memory operands in the flush path
+two:	dq  2
+three:	dq  3
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+%define job_rax          rax
+
+; idx needs to be in rbp; len, idx and tmp all alias rbp, so only one of them is live at a time
+%define len              rbp
+%define idx              rbp
+%define tmp              rbp
+
+%define lane             r8
+
+%define iv               r9
+%define m_last           r10
+%define n                r11
+
+%define unused_lanes     rbx
+%define r                rbx
+
+%define tmp3             r12
+%define tmp4             r13
+%define tmp2             r14
+
+%define good_lane        r15
+%define rbits            r15
+
+; STACK is 9 qwords (72 bytes, an odd multiple of 8); the prologue re-aligns rsp to 16 with "and rsp, -16"
+; This routine and its callee clobber all GPRs, so callee-saved ones are kept in _gpr_save
+struc STACK
+_gpr_save:	resq	8
+_rsp_save:	resq	1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; AES CMAC job submit & flush
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1
+%define %%SUBMIT_FLUSH %1
+
+        mov	rax, rsp
+        sub	rsp, STACK_size
+        and	rsp, -16
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+        ;; Load the free-lane list (4-bit lane indices packed into one register)
+ 	mov	unused_lanes, [state + _aes_cmac_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+ 	mov	lane, unused_lanes
+        and	lane, 0xF	; lane = lowest nibble of the list
+ 	shr	unused_lanes, 4	; drop that nibble from the list
+ 	mov	[state + _aes_cmac_unused_lanes], unused_lanes
+
+        ;; Record the job pointer in its lane
+ 	mov	[state + _aes_cmac_job_in_lane + lane*8], job
+        ;; Copy keys into lane args
+ 	mov	tmp, [job + _key_expanded]
+ 	mov	[state + _aes_cmac_args_keys + lane*8], tmp
+        mov     tmp, lane
+        shl     tmp, 4  ; lane*16
+
+        ;; Zero IV to store digest
+        pxor    xmm0, xmm0
+        movdqa  [state + _aes_cmac_args_IV + tmp], xmm0
+
+        lea     m_last, [state + _aes_cmac_scratch + tmp]
+
+        ;; calculate len
+        ;; convert bits to bytes (message length in bits for CMAC)
+        mov     len, [job + _msg_len_to_hash_in_bits]
+        mov     rbits, len
+        add     len, 7      ; inc len if there are remainder bits
+        shr     len, 3
+        and     rbits, 7
+
+        ;; n = number of 16-byte blocks, rounded up (n == 0 only when len == 0)
+        mov     n, len
+        add     n, 0xf
+        shr     n, 4
+
+        ;; r = number of bytes in a trailing partial block (0 when len is a multiple of 16)
+        mov     r, len
+        and     r, 0xf
+
+        or      n, n   ; n == 0 -> empty message
+        jz      %%_lt_one_block
+
+        ;; One or more blocks, potentially partial
+        mov     word [state + _aes_cmac_init_done + lane*2], 0
+
+        mov     tmp2, [job + _src]
+        add     tmp2, [job + _hash_start_src_offset_in_bytes]
+        mov     [state + _aes_cmac_args_in + lane*8], tmp2
+
+        ;; lane length = (n-1)*16; the last block is processed later from m_last
+        lea     tmp2, [n - 1]
+        shl     tmp2, 4
+        movdqa  xmm0, [state + _aes_cmac_lens]
+        XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
+        movdqa  [state + _aes_cmac_lens], xmm0
+
+        ;; non-zero remainder bits -> bit-level padding path
+        or      rbits, rbits
+        jnz     %%_not_complete_block_3gpp
+
+        ;; check if complete block
+        or      r, r
+        jz      %%_complete_block
+
+%%_not_complete_block:
+        ;; M_last = padding(M_n) XOR K2
+        lea     tmp, [rel padding_0x80_tab16 + 16]
+        sub     tmp, r
+        movdqu  xmm0, [tmp]
+        movdqa  [m_last], xmm0
+
+        mov     tmp, [job + _src]
+        add     tmp, [job + _hash_start_src_offset_in_bytes]
+        lea     tmp3, [n - 1]
+        shl     tmp3, 4
+        add     tmp, tmp3
+
+        memcpy_sse_16 m_last, tmp, r, tmp4, iv
+
+        ;; XOR the padded last block with the subkey at job->_skey2
+        mov     tmp3, [job + _skey2]
+        movdqa  xmm1, [m_last]
+        movdqu  xmm0, [tmp3]
+        pxor    xmm0, xmm1
+        movdqa  [m_last], xmm0
+
+%%_step_5:
+        ;; Find min length
+        movdqa  xmm0, [state + _aes_cmac_lens]
+        phminposuw xmm1, xmm0
+
+        cmp	byte [state + _aes_cmac_unused_lanes], 0xf	; presumably 0xf = list terminator only, i.e. no free lane left - confirm
+        jne	%%_return_null
+
+%else ; end SUBMIT
+
+        ;; Check at least one job (bit 19 set -> return NULL; presumably the all-lanes-free pattern - confirm)
+        bt      unused_lanes, 19
+	jc      %%_return_null
+
+      	;; Find a lane with a non-null job
+	xor	good_lane, good_lane
+	cmp	qword [state + _aes_cmac_job_in_lane + 1*8], 0
+	cmovne	good_lane, [rel one]
+	cmp	qword [state + _aes_cmac_job_in_lane + 2*8], 0
+	cmovne	good_lane, [rel two]
+	cmp	qword [state + _aes_cmac_job_in_lane + 3*8], 0
+	cmovne	good_lane, [rel three]
+
+	; Copy good_lane's in/keys/IV to empty lanes and OR 0xFFFF into their length words
+	mov	tmp2, [state + _aes_cmac_args_in + good_lane*8]
+	mov	tmp3, [state + _aes_cmac_args_keys + good_lane*8]
+	shl	good_lane, 4 ; multiply by 16
+	movdqa	xmm2, [state + _aes_cmac_args_IV + good_lane]
+	movdqa	xmm0, [state + _aes_cmac_lens]
+
+%assign I 0
+%rep 4
+	cmp	qword [state + _aes_cmac_job_in_lane + I*8], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _aes_cmac_args_in + I*8], tmp2
+	mov	[state + _aes_cmac_args_keys + I*8], tmp3
+	movdqa	[state + _aes_cmac_args_IV + I*16], xmm2
+	por	xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+        ;; Find min length
+        phminposuw xmm1, xmm0
+
+%endif ; end FLUSH
+
+%%_cmac_round:
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+        cmp	len2, 0
+	je	%%_len_is_0
+        pshuflw	xmm1, xmm1, 0
+	psubw	xmm0, xmm1
+	movdqa	[state + _aes_cmac_lens], xmm0
+
+        ; "state" and "args" are the same address, arg1
+	; len2 is arg2
+	call    AES128_CBC_MAC
+	; state and idx are intact
+
+        movdqa  xmm0, [state + _aes_cmac_lens]  ; preload lens
+%%_len_is_0:
+        ; init_done != 0 -> the final block (m_last) has already been queued, so the digest is ready
+        test    word [state + _aes_cmac_init_done + idx*2], 0xffff
+        jnz     %%_copy_complete_digest
+
+        ; Queue the final block: 16 more bytes, read from this lane's scratch area (m_last)
+        mov     word [state + _aes_cmac_init_done + idx*2], 1
+
+        XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
+        movdqa  [state + _aes_cmac_lens], xmm0
+
+        phminposuw xmm1, xmm0 ; find min length
+
+        mov     tmp3, idx
+        shl     tmp3, 4  ; idx*16
+        lea     m_last, [state + _aes_cmac_scratch + tmp3]
+        mov     [state + _aes_cmac_args_in + idx*8], m_last
+
+        jmp     %%_cmac_round
+
+%%_copy_complete_digest:
+        ; Job complete, copy digest to AT output
+ 	mov	job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+        mov     tmp4, idx
+        shl     tmp4, 4
+        lea     tmp3, [state + _aes_cmac_args_IV + tmp4]
+        mov     tmp4, [job_rax + _auth_tag_output_len_in_bytes]
+        mov     tmp2, [job_rax + _auth_tag_output]
+
+        cmp     tmp4, 16
+        jne     %%_ne_16_copy
+
+        ;; 16 byte AT copy
+        movdqu  xmm0, [tmp3]
+        movdqu  [tmp2], xmm0
+        jmp     %%_update_lanes
+
+%%_ne_16_copy:
+        memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv
+
+%%_update_lanes:
+        ; Update unused lanes
+        mov	unused_lanes, [state + _aes_cmac_unused_lanes]
+        shl	unused_lanes, 4
+ 	or	unused_lanes, idx
+ 	mov	[state + _aes_cmac_unused_lanes], unused_lanes
+
+        ; Set return job
+        mov	job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+ 	mov	qword [state + _aes_cmac_job_in_lane + idx*8], 0
+ 	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+        ;; Clear digest (in memory for IV) and scratch memory of returned job
+        movdqa  [tmp3], xmm0	; tmp3 = &args_IV[idx] from the digest copy step (assumes memcpy_sse_16 leaves its source register intact - confirm)
+
+        shl     idx, 4
+        movdqa  [state + _aes_cmac_scratch + idx], xmm0
+
+%else
+        ;; Clear digest and scratch memory of returned job and "NULL lanes"
+%assign I 0
+%rep 4
+        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
+        jne     APPEND(skip_clear_,I)
+        movdqa  [state + _aes_cmac_args_IV + I*16], xmm0
+        movdqa  [state + _aes_cmac_scratch + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif ;; SUBMIT
+
+%endif ;; SAFE_DATA
+
+%%_return:
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+	ret
+
+%%_return_null:
+	xor	job_rax, job_rax
+	jmp	%%_return
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+%%_complete_block:
+
+        ;; Block size aligned
+        mov     tmp2, [job + _src]
+        add     tmp2, [job + _hash_start_src_offset_in_bytes]
+        lea     tmp3, [n - 1]
+        shl     tmp3, 4
+        add     tmp2, tmp3
+
+        ;; M_last = M_n XOR K1
+        mov     tmp3, [job + _skey1]
+        movdqu  xmm0, [tmp3]
+        movdqu  xmm1, [tmp2]
+        pxor    xmm0, xmm1
+        movdqa  [m_last], xmm0
+
+        jmp     %%_step_5
+
+%%_lt_one_block:
+        ;; Empty message (n == 0): process a single padded block taken from m_last
+        mov     word [state + _aes_cmac_init_done + lane*2], 1
+        mov     [state + _aes_cmac_args_in + lane*8], m_last
+
+        movdqa  xmm0, [state + _aes_cmac_lens]
+        XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
+        movdqa  [state + _aes_cmac_lens], xmm0
+
+        mov     n, 1	; makes (n-1)*16 == 0 when the last-block offset is computed
+        jmp     %%_not_complete_block
+
+%%_not_complete_block_3gpp:
+        ;; bit pad last block
+        ;; xor with skey2
+        ;; copy to m_last
+
+        ;; load pointer to src
+        mov     tmp, [job + _src]
+        add     tmp, [job + _hash_start_src_offset_in_bytes]
+        lea     tmp3, [n - 1]
+        shl     tmp3, 4
+        add     tmp, tmp3
+
+        ;; check if partial block
+        or      r, r
+        jz      %%_load_full_block_3gpp
+
+        simd_load_sse_15_1 xmm0, tmp, r
+        dec     r
+
+%%_update_mlast_3gpp:
+        ;; set last byte padding mask
+        ;; shift into correct xmm idx
+
+        ;; save and restore rcx on windows (rcx is arg1/state there; cl is needed as the shift count)
+%ifndef LINUX
+	mov	tmp, rcx
+%endif
+        mov     rcx, rbits
+        mov     tmp3, 0xff
+        shr     tmp3, cl
+        movq    xmm2, tmp3
+        XPSLLB  xmm2, r, xmm1, tmp2
+
+        ;; pad final byte
+        pandn   xmm2, xmm0
+%ifndef LINUX
+	mov	rcx, tmp
+%endif
+        ;; set OR mask to pad final bit
+        mov     tmp2, tmp3
+        shr     tmp2, 1
+        xor     tmp2, tmp3 ; XOR to get OR mask
+        movq    xmm3, tmp2
+        ;; xmm1 contains shift table from previous shift
+        pshufb  xmm3, xmm1
+
+        ;; load skey2 address
+        mov     tmp3, [job + _skey2]
+        movdqu  xmm1, [tmp3]
+
+        ;; set final padding bit
+        por     xmm2, xmm3
+
+        ;; XOR last partial block with skey2
+        ;; update mlast
+        pxor    xmm2, xmm1
+        movdqa  [m_last], xmm2
+
+        jmp     %%_step_5
+
+%%_load_full_block_3gpp:
+        movdqu  xmm0, [tmp]
+        mov     r, 0xf
+        jmp     %%_update_mlast_3gpp
+%endif
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (manager holding the per-lane jobs, lengths, IVs and scratch blocks)
+; arg 2 : job   (job to queue); rax returns a completed job pointer or NULL
+MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
+SUBMIT_JOB_AES_CMAC_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state)
+; arg 1 : state (lanes without a job are filled from a lane that has one; rax returns a completed job or NULL)
+MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
+FLUSH_JOB_AES_CMAC_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm
new file mode 100644
index 000000000..0066aff9f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm
@@ -0,0 +1,217 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_CBC_ENC_X4
+%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4
+%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_sse
+%endif
+
+; void AES_CBC_ENC_X4(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X4
+
+section .data
+default rel
+
+align 16
+len_masks:
+	; lane 0 mask (ddq 0x0000000000000000000000000000FFFF) - OR-ed into lens for a lane with no job
+	dq 0x000000000000FFFF, 0x0000000000000000
+	; lane 1 mask (ddq 0x000000000000000000000000FFFF0000)
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	; lane 2 mask (ddq 0x00000000000000000000FFFF00000000)
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	; lane 3 mask (ddq 0x0000000000000000FFFF000000000000)
+	dq 0xFFFF000000000000, 0x0000000000000000
+one:	dq  1	; lane-index constants, used as cmovne memory operands when picking good_lane
+two:	dq  2
+three:	dq  3
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+%define job_rax          rax
+
+%if 1
+%define unused_lanes     rbx
+%define tmp1             rbx
+
+%define good_lane        rdx
+%define iv               rdx
+
+%define tmp2             rax
+
+; idx needs to be in rbp; tmp shares rbp with it, so only one of them is live at a time
+%define tmp              rbp
+%define idx              rbp
+
+%define tmp3             r8
+%endif
+
+; STACK is 9 qwords (72 bytes, an odd multiple of 8); the prologue re-aligns rsp to 16 with "and rsp, -16"
+; This routine and its callee clobber all GPRs, so callee-saved ones are kept in _gpr_save
+struc STACK
+_gpr_save:	resq	8
+_rsp_save:	resq	1
+endstruc
+
+; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job (never read by this routine; the arg2 register is only reused as len2)
+MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
+FLUSH_JOB_AES_ENC:
+
+        mov	rax, rsp
+        sub	rsp, STACK_size
+        and	rsp, -16
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	; check for empty: bit 39 of unused_lanes set -> return NULL (presumably all lanes free - confirm)
+	mov	unused_lanes, [state + _aes_unused_lanes]
+	bt	unused_lanes, 32+7
+	jc	return_null
+
+	; find a lane with a non-null job (lane 0 is the default if lanes 1..3 are all empty)
+	xor	good_lane, good_lane
+	cmp	qword [state + _aes_job_in_lane + 1*8], 0
+	cmovne	good_lane, [rel one]
+	cmp	qword [state + _aes_job_in_lane + 2*8], 0
+	cmovne	good_lane, [rel two]
+	cmp	qword [state + _aes_job_in_lane + 3*8], 0
+	cmovne	good_lane, [rel three]
+
+	; copy good_lane's in/out/keys/IV to empty lanes and OR 0xFFFF into their length words
+	mov	tmp1, [state + _aes_args_in + good_lane*8]
+	mov	tmp2, [state + _aes_args_out + good_lane*8]
+	mov	tmp3, [state + _aes_args_keys + good_lane*8]
+	shl	good_lane, 4 ; multiply by 16
+	movdqa	xmm2, [state + _aes_args_IV + good_lane]
+	movdqa	xmm0, [state + _aes_lens]
+
+%assign I 0
+%rep 4
+	cmp	qword [state + _aes_job_in_lane + I*8], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _aes_args_in + I*8], tmp1
+	mov	[state + _aes_args_out + I*8], tmp2
+	mov	[state + _aes_args_keys + I*8], tmp3
+	movdqa	[state + _aes_args_IV + I*16], xmm2
+	por	xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	phminposuw	xmm1, xmm0
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0
+
+	pshuflw	xmm1, xmm1, 0	; broadcast min length to the four low words
+	psubw	xmm0, xmm1	; subtract min length from every lane length
+	movdqa	[state + _aes_lens], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	AES_CBC_ENC_X4
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	mov	job_rax, [state + _aes_job_in_lane + idx*8]
+	mov	unused_lanes, [state + _aes_unused_lanes]
+	mov	qword [state + _aes_job_in_lane + idx*8], 0
+	or	dword [job_rax + _status], STS_COMPLETED_AES
+	shl	unused_lanes, 8	; make room for one byte-wide lane index
+	or	unused_lanes, idx	; put idx back on the free-lane list
+	mov	[state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+        ;; Clear IVs of returned job and "NULL lanes"
+        pxor    xmm0, xmm0
+%assign I 0
+%rep 4
+	cmp	qword [state + _aes_job_in_lane + I*8], 0
+	jne	APPEND(skip_clear_,I)
+	movdqa	[state + _aes_args_IV + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+
+return:
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm
new file mode 100644
index 000000000..702fb91a4
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm
@@ -0,0 +1,187 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+%ifndef AES_CBC_ENC_X4
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4
+%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_sse
+
+%endif
+
+; void AES_CBC_ENC_X4(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X4
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+%define job_rax          rax
+
+%if 1
+; idx needs to be in rbp; len, idx and tmp all alias rbp, so only one of them is live at a time
+%define len              rbp
+%define idx              rbp
+%define tmp              rbp
+
+%define lane             r8
+
+%define iv               r9
+
+%define unused_lanes     rbx
+%endif
+
+; STACK is 9 qwords (72 bytes, an odd multiple of 8); the prologue re-aligns rsp to 16 with "and rsp, -16"
+; This routine and its callee clobber all GPRs, so callee-saved ones are kept in _gpr_save
+struc STACK
+_gpr_save:	resq	8
+_rsp_save:	resq	1
+endstruc
+
+section .text
+
+; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job (job to queue); rax returns a completed job pointer or NULL
+MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal)
+SUBMIT_JOB_AES_ENC:
+
+        mov	rax, rsp
+        sub	rsp, STACK_size
+        and	rsp, -16
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _aes_unused_lanes]
+	movzx	lane, BYTE(unused_lanes)	; lane = low byte of the free-lane list
+	shr	unused_lanes, 8	; drop that byte from the list
+	mov	iv, [job + _iv]
+	mov	[state + _aes_unused_lanes], unused_lanes
+
+	mov	[state + _aes_job_in_lane + lane*8], job
+	mov	tmp, [job + _src]
+	add	tmp, [job + _cipher_start_src_offset_in_bytes]
+	movdqu	xmm0, [iv]
+	mov	[state + _aes_args_in + lane*8], tmp
+	mov	tmp, [job + _aes_enc_key_expanded]
+	mov	[state + _aes_args_keys + lane*8], tmp
+	mov	tmp, [job + _dst]
+	mov	[state + _aes_args_out + lane*8], tmp
+	shl	lane, 4	; multiply by 16
+	movdqa	[state + _aes_args_IV + lane], xmm0
+
+        ;; insert len into proper lane (lane already holds lane*16 here; presumably why no_scale is passed - confirm)
+        mov     len, [job + _msg_len_to_cipher_in_bytes]
+        and     len, -16        ; DOCSIS may pass size unaligned to block size
+
+        movdqa  xmm0, [state + _aes_lens]
+        XPINSRW xmm0, xmm1, tmp, lane, len, no_scale
+        movdqa  [state + _aes_lens], xmm0
+
+	cmp	unused_lanes, 0xff	; presumably 0xff = list terminator only, i.e. no free lane left - confirm
+	jne	return_null
+
+	; Find min length
+	phminposuw	xmm1, xmm0
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0
+
+	pshuflw	xmm1, xmm1, 0	; broadcast min length to the four low words
+	psubw	xmm0, xmm1	; subtract min length from every lane length
+	movdqa	[state + _aes_lens], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	AES_CBC_ENC_X4
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	mov	job_rax, [state + _aes_job_in_lane + idx*8]
+	mov	unused_lanes, [state + _aes_unused_lanes]
+	mov	qword [state + _aes_job_in_lane + idx*8], 0
+	or	dword [job_rax + _status], STS_COMPLETED_AES
+	shl	unused_lanes, 8	; make room for one byte-wide lane index
+	or	unused_lanes, idx	; put idx back on the free-lane list
+	mov	[state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+        ;; Clear IV and key pointer of the returned job's lane
+        pxor    xmm0, xmm0
+        shl     idx, 3 ; multiply by 8
+        movdqa  [state + _aes_args_IV + idx*2], xmm0	; idx*2 here = lane*16
+        mov     qword [state + _aes_args_keys + idx], 0
+%endif
+
+return:
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm
new file mode 100644
index 000000000..6069ce17a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm
@@ -0,0 +1,242 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_XCBC_X4
+%define AES_XCBC_X4 aes_xcbc_mac_128_x4
+%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse
+%endif
+
+; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X4
+
+section .data
+default rel
+
+align 16
+len_masks:	; four 16-byte masks: entry I has 0xFFFF in 16-bit word I; OR'ed into the lens vector for lanes without a job
+	;ddq 0x0000000000000000000000000000FFFF	(entry 0)
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000	(entry 1)
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	;ddq 0x00000000000000000000FFFF00000000	(entry 2)
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	;ddq 0x0000000000000000FFFF000000000000	(entry 3)
+	dq 0xFFFF000000000000, 0x0000000000000000
+one:	dq  1	; qword constants 1..3: memory operands of the cmovne lane-index selection
+two:	dq  2
+three:	dq  3
+
+section .text
+
+%define APPEND(a,b) a %+ b
+; APPEND pastes two tokens together; used to build the per-lane skip_<I> / skip_clear_<I> labels
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+; state, job and len2 are aliases of the argument registers (job and len2 share arg2)
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+%define job_rax          rax
+
+%if 1
+%define unused_lanes     rbx
+%define tmp1             rbx
+; unused_lanes/tmp1 share rbx and job_rax/tmp2 share rax; icv is rdx, which is also arg2 when LINUX is not defined
+%define icv              rdx
+
+%define tmp2             rax
+
+; idx needs to be in rbp (it is read again after the AES_XCBC_X4 call)
+%define tmp              r10
+%define idx              rbp
+
+%define tmp3             r8
+%define lane_data        r9
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save:	resq	8	; rbx, rbp, r12-r15, plus rsi/rdi when LINUX is not defined
+_rsp_save:	resq	1	; caller's rsp before the frame was aligned
+endstruc
+
+; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job (never read as an input below; its register is reused as scratch)
+MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal)
+FLUSH_JOB_AES_XCBC:
+	; Returns rax = job pointer taken from the lane that completed, or 0 via return_null.
+        mov	rax, rsp	; remember the incoming rsp; it is reloaded from _rsp_save at return
+        sub	rsp, STACK_size
+        and	rsp, -16	; align the save area down to 16 bytes
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	; check for empty
+	mov	unused_lanes, [state + _aes_xcbc_unused_lanes]
+	bt	unused_lanes, 32+7	; CF = bit 39 of unused_lanes
+	jc	return_null	; bit set -> return 0 without touching any lane
+
+	; find a lane with a non-null job
+	xor	idx, idx	; default to lane 0; each later match overrides it, so the highest matching lane wins
+	cmp	qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+	cmovne	idx, [rel one]
+	cmp	qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+	cmovne	idx, [rel two]
+	cmp	qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+	cmovne	idx, [rel three]
+
+copy_lane_data:
+	; copy idx to empty lanes
+	mov	tmp1, [state + _aes_xcbc_args_in + idx*8]	; tmp1 = input pointer of lane idx (tmp1 shares rbx with unused_lanes)
+	mov	tmp3, [state + _aes_xcbc_args_keys + idx*8]	; tmp3 = key pointer of lane idx
+	shl	idx, 4 ; multiply by 16
+	movdqa	xmm2, [state + _aes_xcbc_args_ICV + idx]	; xmm2 = 16-byte ICV of lane idx
+	movdqa	xmm0, [state + _aes_xcbc_lens]	; xmm0 = packed 16-bit lane lengths
+
+%assign I 0
+%rep 4
+	cmp	qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+	jne	APPEND(skip_,I)	; lane I holds a job: leave its arguments alone
+	mov	[state + _aes_xcbc_args_in + I*8], tmp1
+	mov	[state + _aes_xcbc_args_keys + I*8], tmp3
+	movdqa	[state + _aes_xcbc_args_ICV + I*16], xmm2
+	por	xmm0, [rel len_masks + 16*I]	; force the length word of empty lane I to 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	movdqa	[state + _aes_xcbc_lens], xmm0
+
+	; Find min length
+	phminposuw	xmm1, xmm0
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0	; the minimum lane has nothing left to process: skip the call
+
+	pshuflw	xmm1, xmm1, 0	; broadcast the minimum into the four low words
+	psubw	xmm0, xmm1	; subtract the minimum from every lane length
+	movdqa	[state + _aes_xcbc_lens], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	AES_XCBC_X4
+	; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        imul    lane_data, idx, _XCBC_LANE_DATA_size
+        lea     lane_data, [state + _aes_xcbc_ldata + lane_data]	; lane_data = address of lane idx's lane-data entry
+        cmp     dword [lane_data + _xcbc_final_done], 0
+        jne     end_loop	; final block was already queued once -> finish the job
+
+        mov     dword [lane_data + _xcbc_final_done], 1	; otherwise queue the final block exactly once
+        mov     word [state + _aes_xcbc_lens + 2*idx], 16	; lane length = one more 16-byte block
+        lea     tmp, [lane_data + _xcbc_final_block]
+        mov     [state + _aes_xcbc_args_in + 8*idx], tmp	; lane input now points at its final_block buffer
+        jmp     copy_lane_data	; re-propagate lane idx to the empty lanes and run again
+
+end_loop:
+        mov     job_rax, [lane_data + _xcbc_job_in_lane]	; return value: the job stored in this lane
+        mov     icv,  [job_rax + _auth_tag_output]	; icv = destination pointer for the tag
+        mov     unused_lanes, [state + _aes_xcbc_unused_lanes]
+        mov     qword [lane_data + _xcbc_job_in_lane], 0	; mark the lane as holding no job
+        or      dword [job_rax + _status], STS_COMPLETED_HMAC
+        shl     unused_lanes, 8	; make room in the low byte ...
+        or      unused_lanes, idx	; ... and put this lane index there
+        shl     idx, 4 ; multiply by 16
+        mov     [state + _aes_xcbc_unused_lanes], unused_lanes
+
+	; copy 12 bytes
+	movdqa	xmm0, [state + _aes_xcbc_args_ICV + idx]
+	movq	[icv], xmm0	; bytes 0..7
+	pextrd	[icv + 8], xmm0, 2	; bytes 8..11
+
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+
+        ;; Clear ICV's and final blocks in returned job and NULL lanes
+%assign I 0
+%rep 4
+        cmp	qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        jne	APPEND(skip_clear_,I)	; lane I still holds a job: keep its data
+        movdqa	[state + _aes_xcbc_args_ICV + I*16], xmm0
+        lea     lane_data, [state + _aes_xcbc_ldata + (I * _XCBC_LANE_DATA_size)]
+        movdqa  [lane_data + _xcbc_final_block], xmm0	; zero 32 bytes starting at final_block
+        movdqa  [lane_data + _xcbc_final_block + 16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+return:
+	; restore the registers saved in the prologue, then the caller's rsp
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; return value 0: no job returned
+	jmp	return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm
new file mode 100644
index 000000000..e61cc07b1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm
@@ -0,0 +1,263 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/const.inc"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%ifndef AES_XCBC_X4
+%define AES_XCBC_X4 aes_xcbc_mac_128_x4
+%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse
+%endif
+
+; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X4
+
+section .data
+default rel
+
+align 16
+x80:            ;ddq 0x00000000000000000000000000000080  (byte 0x80 followed by 15 zero bytes; loaded as "padding" in slow_copy)
+        dq 0x0000000000000080, 0x0000000000000000
+
+section .text
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+; state, job and len2 are aliases of the argument registers (job and len2 share arg2)
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+%define job_rax          rax
+
+%if 1
+; idx needs to be in rbp (it is read again after the AES_XCBC_X4 call); last_len shares rbp
+%define idx              rbp
+%define last_len         rbp
+
+%define lane             r8
+; icv and p2 share r9
+%define icv              r9
+%define p2               r9
+
+%define tmp              r10
+%define len              r11
+%define lane_data        r12
+%define p                r13
+%define tmp2             r14
+
+%define unused_lanes     rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save:	resq	8	; rbx, rbp, r12-r15, plus rsi/rdi when LINUX is not defined
+_rsp_save:	resq	1	; caller's rsp before the frame was aligned
+endstruc
+
+; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
+SUBMIT_JOB_AES_XCBC:
+	; Returns rax = a completed job, or 0 via return_null when unused_lanes is not 0xff after taking a lane.
+        mov	rax, rsp	; remember the incoming rsp; it is reloaded from _rsp_save at return
+        sub	rsp, STACK_size
+        and	rsp, -16	; align the save area down to 16 bytes
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _aes_xcbc_unused_lanes]
+	movzx	lane, BYTE(unused_lanes)	; lane = low byte of unused_lanes
+	shr	unused_lanes, 8	; remove that entry
+	imul	lane_data, lane, _XCBC_LANE_DATA_size
+	lea	lane_data, [state + _aes_xcbc_ldata + lane_data]	; lane_data = address of this lane's lane-data entry
+	mov	[state + _aes_xcbc_unused_lanes], unused_lanes
+	mov	len, [job + _msg_len_to_hash_in_bytes]
+	mov	[lane_data + _xcbc_job_in_lane], job
+	mov	dword [lane_data + _xcbc_final_done], 0	; final block not queued yet
+	mov	tmp, [job + _k1_expanded]
+	mov	[state + _aes_xcbc_args_keys + lane*8], tmp	; lane key pointer = job's k1_expanded
+	mov	p, [job + _src]
+	add	p, [job + _hash_start_src_offset_in_bytes]	; p = src + hash start offset
+
+	mov	last_len, len
+
+	cmp	len, 16
+	jle	small_buffer	; len <= 16 (signed compare): only the final block is processed
+
+	mov	[state + _aes_xcbc_args_in + lane*8], p
+	add	p, len		; set point to end of data
+
+	and	last_len, 15	; Check lsbs of msg len
+	jnz	slow_copy	; if not 16B mult, do slow copy
+
+fast_copy:
+	movdqu	xmm0, [p - 16]	; load last block M[n]
+        mov     tmp, [job + _k2] ; load K2 address
+        movdqu  xmm1, [tmp]     ; load K2
+        pxor    xmm0, xmm1      ; M[n] XOR K2
+	movdqa	[lane_data + _xcbc_final_block], xmm0
+	sub	len, 16		; take last block off length
+end_fast_copy:
+	pxor	xmm0, xmm0
+	shl	lane, 4	; multiply by 16
+	movdqa	[state + _aes_xcbc_args_ICV + lane], xmm0	; start this lane with an all-zero ICV
+
+        ;; insert len into proper lane (lane is already multiplied by 16 here; NOTE(review): confirm XPINSRW's no_scale expects that)
+        movdqa  xmm0, [state + _aes_xcbc_lens]
+        XPINSRW xmm0, xmm1, tmp, lane, len, no_scale
+        movdqa  [state + _aes_xcbc_lens], xmm0
+
+	cmp	unused_lanes, 0xff	; unused_lanes still holds the value stored back above
+	jne	return_null	; anything other than 0xff -> return 0 without processing
+
+start_loop:
+	; Find min length
+	phminposuw	xmm1, xmm0
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0	; the minimum lane has nothing left to process: skip the call
+
+	pshuflw	xmm1, xmm1, 0	; broadcast the minimum into the four low words
+	psubw	xmm0, xmm1	; subtract the minimum from every lane length
+	movdqa	[state + _aes_xcbc_lens], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	AES_XCBC_X4
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _XCBC_LANE_DATA_size
+	lea	lane_data, [state + _aes_xcbc_ldata + lane_data]	; lane_data = address of lane idx's lane-data entry
+	cmp	dword [lane_data + _xcbc_final_done], 0
+	jne	end_loop	; final block was already queued once -> finish the job
+
+	mov	dword [lane_data + _xcbc_final_done], 1	; otherwise queue the final block exactly once
+	mov	word [state + _aes_xcbc_lens + 2*idx], 16	; lane length = one more 16-byte block
+	lea	tmp, [lane_data + _xcbc_final_block]
+	mov	[state + _aes_xcbc_args_in + 8*idx], tmp	; lane input now points at its final_block buffer
+        movdqa	xmm0, [state + _aes_xcbc_lens]	; reload lengths for the next min search
+	jmp	start_loop
+
+end_loop:
+	; process completed job "idx"
+	mov	job_rax, [lane_data + _xcbc_job_in_lane]	; return value: the job stored in this lane
+	mov	icv, [job_rax + _auth_tag_output]	; icv = destination pointer for the tag
+	mov	unused_lanes, [state + _aes_xcbc_unused_lanes]
+	mov	qword [lane_data + _xcbc_job_in_lane], 0	; mark the lane as holding no job
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	shl	unused_lanes, 8	; make room in the low byte ...
+	or	unused_lanes, idx	; ... and put this lane index there
+	shl	idx, 4 ; multiply by 16
+	mov	[state + _aes_xcbc_unused_lanes], unused_lanes
+
+	; copy 12 bytes
+	movdqa	xmm0, [state + _aes_xcbc_args_ICV + idx]
+	movq	[icv], xmm0	; bytes 0..7
+	pextrd	[icv + 8], xmm0, 2	; bytes 8..11
+
+%ifdef SAFE_DATA
+        ;; Clear ICV
+        pxor    xmm0, xmm0
+        movdqa  [state + _aes_xcbc_args_ICV + idx], xmm0
+
+        ;; Clear final block (32 bytes)
+        movdqa  [lane_data + _xcbc_final_block], xmm0
+        movdqa  [lane_data + _xcbc_final_block + 16], xmm0
+%endif
+
+return:
+	; restore the registers saved in the prologue, then the caller's rsp
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+small_buffer:
+	; For buffers <= 16 Bytes
+	; The input data is set to final block
+	lea	tmp, [lane_data + _xcbc_final_block] ; final block
+	mov	[state + _aes_xcbc_args_in + lane*8], tmp
+	add	p, len		; set point to end of data
+	cmp	len, 16
+	je	fast_copy	; exactly one full block: take the K2 path
+
+slow_copy:
+	and	len, ~15	; take final block off len
+	sub	p, last_len	; adjust data pointer
+	lea	p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
+	sub	p2, last_len	; adjust data pointer backwards (p2 + last_len = final_block + 16)
+	memcpy_sse_16_1 p2, p, last_len, tmp, tmp2
+        movdqa	xmm0, [rel x80]	; fill reg with padding
+	movdqu	[lane_data + _xcbc_final_block + 16], xmm0 ; add padding
+	movdqu	xmm0, [p2]	; load final block to process
+	mov	tmp, [job + _k3] ; load K3 address
+	movdqu	xmm1, [tmp]	; load K3
+	pxor	xmm0, xmm1	; M[n] XOR K3
+	movdqu	[lane_data + _xcbc_final_block], xmm0	; write final block
+	jmp	end_fast_copy
+
+return_null:
+	xor	job_rax, job_rax	; return value 0: no job returned
+	jmp	return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm
new file mode 100644
index 000000000..ac1bb8691
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm
@@ -0,0 +1,305 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;;	callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;;	callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers:		RAX RBX RCX RDX RBP RSI RDI R8  R9  R10 R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Windows clobbers:	RAX     RCX RDX             R8
+;; Windows preserves:	    RBX         RBP RSI RDI     R9  R10 R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Linux clobbers:	RAX                 RSI RDI R8
+;; Linux preserves:	    RBX RCX RDX RBP             R9  R10 R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_ni
+
+section .data
+default rel
+
+align 16
+byteswap:	;ddq 0x0c0d0e0f08090a0b0405060700010203  (pshufb mask: reverses the 4 bytes inside each dword)
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+one:	; qword constant 1: memory operand of the cmovne that selects lane 1
+	dq 1
+
+section .text
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+; state, job and len2 are aliases of the argument registers (job and len2 share arg2)
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx             rbp
+; the names in each group below share one register, so only one of them is live at a time
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2		rbx
+
+%define job_rax         rax
+%define	tmp1		rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3		arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4		r8
+%define p2		r8
+
+; This routine clobbers rbx, rbp (both are saved in _gpr_save and restored at return)
+struc STACK
+_gpr_save:	resq	4	; rbx, rbp, plus rsi/rdi when LINUX is not defined
+_rsp_save:	resq	1	; caller's rsp before the frame was aligned
+endstruc
+; APPEND pastes two tokens together; used to build the skip_clear_<I> labels
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : state
+MKGLOBAL(flush_job_hmac_ni_sse,function,internal)
+flush_job_hmac_ni_sse:
+	; Returns rax = job pointer taken from the lane that completed, or 0 via return_null.
+        mov	rax, rsp	; remember the incoming rsp; it is reloaded from _rsp_save at return
+        sub	rsp, STACK_size
+        and	rsp, -16	; align the save area down to 16 bytes
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*2], rsi
+	mov	[rsp + _gpr_save + 8*3], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+        DBGPRINTL "enter sha1-ni-sse flush"
+	mov	unused_lanes, [state + _unused_lanes]
+	bt	unused_lanes, 16+7	; CF = bit 23 of unused_lanes
+	jc	return_null	; bit set -> return 0 without touching any lane
+
+	; find a lane with a non-null job, assume it is 0 then check 1
+	xor	idx, idx
+	cmp	qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel one]
+	DBGPRINTL64 "idx:", idx
+
+copy_lane_data:
+	; copy valid lane (idx) to empty lanes
+	mov	tmp, [state + _args_data_ptr + PTR_SZ*idx]	; tmp = data pointer of lane idx
+	movzx	len2, word [state + _lens + idx*2]	; len2 = 16-bit length of lane idx
+
+	DBGPRINTL64 "ptr", tmp
+
+	; there are only two lanes so if one is empty it is easy to determine which one
+	xor	idx, 1	; switch to the other lane ...
+	mov	[state + _args_data_ptr + PTR_SZ*idx], tmp	; ... give it the same data pointer ...
+	xor	idx, 1	; ... and switch back
+
+	; No need to find min length - only two lanes available
+        cmp	len2, 0
+        je	len_is_0	; nothing left for lane idx: skip the call
+
+	; Set length on both lanes to 0
+	mov	dword [state + _lens], 0	; one 32-bit store covers the two 16-bit length words
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha1_ni
+	; state is intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]	; lane_data = address of lane idx's lane-data entry (shares rbx with unused_lanes)
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks	; extra blocks still pending: queue them first
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop	; outer block already queued once -> finish the job
+
+proc_outer:
+	mov	dword [lane_data + _outer_done], 1	; queue the outer block exactly once
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	mov	qword [lane_data + _extra_block + size_offset], 0	; zero the 8 bytes at extra_block + size_offset
+	mov	word [state + _lens + 2*idx], 1	; lane length = 1
+	DBGPRINTL64 "outer-block-index", idx
+	lea	tmp, [lane_data + _outer_block]
+	DBGPRINTL64 "outer block ptr:", tmp
+	mov	[state + _args_data_ptr + PTR_SZ*idx], tmp	; lane input now points at its outer_block buffer
+
+        ;; idx determines which column
+        ;; read off from consecutive rows
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+	lea	p2, [idx + idx*4]	; p2 = idx*5, so p2*4 = idx*20 = byte offset of this lane's digest
+	movdqu	xmm0, [state + _args_digest + p2*4]	; digest words 0..3
+	pshufb	xmm0, [rel byteswap]	; byte-swap each of the four words
+	mov	DWORD(tmp),  [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE]	; digest word 4
+	bswap	DWORD(tmp)
+	movdqa	[lane_data + _outer_block], xmm0
+	mov	[lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+        DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0
+        DBGPRINTL64 "sha1 outer hash input word 4", tmp
+	mov	job, [lane_data + _job_in_lane]
+	mov	tmp, [job + _auth_key_xor_opad]	; tmp = pointer stored in the job's auth_key_xor_opad field
+	movdqu	xmm0, [tmp]
+	mov	DWORD(tmp),  [tmp + 4*SHA1_DIGEST_WORD_SIZE]
+	movdqu	[state + _args_digest + p2*4], xmm0	; overwrite the lane digest with the five words read through that pointer
+	mov	[state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+	jmp	copy_lane_data
+
+	align	16
+proc_extra_blocks:
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+	DBGPRINTL64 "extra blocks-start offset", start_offset
+	mov	[state + _lens + 2*idx], WORD(extra_blocks)	; lane length = number of extra blocks
+	DBGPRINTL64 "extra blocks-len", extra_blocks
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	DBGPRINTL64 "extra block ptr", tmp
+	mov	[state + _args_data_ptr + PTR_SZ*idx], tmp	; lane input = extra_block + start_offset
+	mov	dword [lane_data + _extra_blocks], 0	; so this path is taken only once per job
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax	; return value 0: no job returned
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]	; return value: the job stored in this lane
+	mov	qword [lane_data + _job_in_lane], 0	; mark the lane as holding no job
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov	unused_lanes, [state + _unused_lanes]	; rbx is reused: lane_data is no longer valid from here
+	shl	unused_lanes, 8	; make room in the low byte ...
+	or	unused_lanes, idx	; ... and put this lane index there
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]	; p = destination pointer for the tag
+
+	; copy 12 bytes
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+	lea	idx, [idx + idx*4]	; idx = idx*5, so idx*4 below = byte offset of this lane's 20-byte digest
+	mov	DWORD(tmp2), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE]
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp4)
+	mov	[p + 0*4], DWORD(tmp2)
+	mov	[p + 1*4], DWORD(tmp4)
+	mov	DWORD(tmp2), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE]
+	bswap	DWORD(tmp2)
+	mov	[p + 2*4], DWORD(tmp2)
+
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 12
+        je      clear_ret	; a 12-byte tag was requested: done copying
+
+        ;; copy remaining 8 bytes to return 20 byte digest
+        mov	DWORD(tmp2), [state + _args_digest + idx*4 + 3*SHA1_DIGEST_WORD_SIZE]
+        mov	DWORD(tmp4), [state + _args_digest + idx*4 + 4*SHA1_DIGEST_WORD_SIZE]
+        bswap	DWORD(tmp2)
+        bswap	DWORD(tmp4)
+        mov	[p + 3*4], DWORD(tmp2)
+        mov	[p + 4*4], DWORD(tmp4)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+
+        ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+        ;; of returned job and NULL jobs
+%assign I 0
+%rep 2
+	cmp	qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne	APPEND(skip_clear_,I)	; lane I still holds a job: keep its data
+
+        ;; Clear digest
+        movdqu  [state + _args_digest + I*20], xmm0
+        mov     dword [state + _args_digest + I*20 + 16], 0
+
+        lea     lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 20 bytes of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+        mov     dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+	; restore the registers saved in the prologue, then the caller's rsp
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*2]
+	mov	rdi, [rsp + _gpr_save + 8*3]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm
new file mode 100644
index 000000000..0f760b01c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm
@@ -0,0 +1,302 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_mult_sse
+
+section .data
+default rel
+
+align 16
+byteswap:	;ddq 0x0c0d0e0f08090a0b0405060700010203  (pshufb mask: reverses the 4 bytes inside each dword)
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+x80:    ;ddq 0x00000000000000000000000000000080  (not referenced by flush_job_hmac_sse below)
+        dq 0x0000000000000080, 0x0000000000000000
+x00:    ;ddq 0x00000000000000000000000000000000  (not referenced by flush_job_hmac_sse below)
+        dq 0x0000000000000000, 0x0000000000000000
+len_masks:	; four 16-byte masks: entry I has 0xFFFF in 16-bit word I; OR'ed into the lens vector for lanes without a job
+	;ddq 0x0000000000000000000000000000FFFF	(entry 0)
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000	(entry 1)
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	;ddq 0x00000000000000000000FFFF00000000	(entry 2)
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	;ddq 0x0000000000000000FFFF000000000000	(entry 3)
+	dq 0xFFFF000000000000, 0x0000000000000000
+one:	dq  1	; qword constants 1..3: memory operands of the cmovne lane-index selection
+two:	dq  2
+three:	dq  3
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+; state, job and len2 are aliases of the argument registers (job and len2 share arg2)
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx             rbp
+; the names in each group below share one register, so only one of them is live at a time
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2		rbx
+
+%define job_rax         rax
+%define	tmp1		rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3		arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4		r8
+
+%endif
+
+; This routine clobbers rbx, rbp (both are saved in _gpr_save and restored at return)
+struc STACK
+_gpr_save:	resq	2	; rbx, rbp
+_rsp_save:	resq	1	; caller's rsp before the frame was aligned
+endstruc
+; APPEND pastes two tokens together; used to build the per-lane skip_<I> / skip_clear_<I> labels
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : state
+MKGLOBAL(flush_job_hmac_sse,function,internal)
+flush_job_hmac_sse:
+	; Returns rax = job pointer taken from the lane that completed, or 0 via return_null.
+        mov	rax, rsp	; remember the incoming rsp; it is reloaded from _rsp_save at return
+        sub	rsp, STACK_size
+        and	rsp, -16	; align the save area down to 16 bytes
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _rsp_save], rax	; original SP
+
+         DBGPRINTL "enter sha1-sse flush"
+	mov	unused_lanes, [state + _unused_lanes]
+	bt	unused_lanes, 32+7	; CF = bit 39 of unused_lanes
+	jc	return_null	; bit set -> return 0 without touching any lane
+
+	; find a lane with a non-null job
+	xor	idx, idx	; default to lane 0; each later match overrides it, so the highest matching lane wins
+	cmp	qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel one]
+	cmp	qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel two]
+	cmp	qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel three]
+copy_lane_data:
+	; copy valid lane (idx) to empty lanes
+	movdqa	xmm0, [state + _lens]	; xmm0 = packed 16-bit lane lengths
+	mov	tmp, [state + _args_data_ptr + PTR_SZ*idx]	; tmp = data pointer of lane idx
+
+%assign I 0
+%rep 4
+	cmp	qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)	; lane I holds a job: leave its arguments alone
+	mov	[state + _args_data_ptr + PTR_SZ*I], tmp
+	por	xmm0, [rel len_masks + 16*I]	; force the length word of empty lane I to 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	movdqa	[state + _lens], xmm0
+
+	phminposuw	xmm1, xmm0
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0	; the minimum lane has nothing left to process: skip the call
+
+	pshuflw	xmm1, xmm1, 0	; broadcast the minimum into the four low words
+	psubw	xmm0, xmm1	; subtract the minimum from every lane length
+	movdqa	[state + _lens], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha1_mult_sse
+	; state is intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]	; lane_data = address of lane idx's lane-data entry (shares rbx with unused_lanes)
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks	; extra blocks still pending: queue them first
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop	; outer block already queued once -> finish the job
+
+proc_outer:
+	mov	dword [lane_data + _outer_done], 1	; queue the outer block exactly once
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	mov	qword [lane_data + _extra_block + size_offset], 0	; zero the 8 bytes at extra_block + size_offset
+	mov	word [state + _lens + 2*idx], 1	; lane length = 1
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr + PTR_SZ*idx], tmp	; lane input now points at its outer_block buffer
+
+        ;; idx determines which column
+        ;; read off from consecutive rows
+	movd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+	pinsrd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+	pinsrd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+	pinsrd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+	pshufb	xmm0, [rel byteswap]	; byte-swap each of the four gathered words
+	mov	DWORD(tmp),  [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]	; fifth digest word
+	bswap	DWORD(tmp)
+	movdqa	[lane_data + _outer_block], xmm0
+	mov	[lane_data + _outer_block + 4*4], DWORD(tmp)
+        DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0
+        DBGPRINTL64 "sha1 outer hash input word 4", tmp
+	mov	tmp, [job + _auth_key_xor_opad]	; tmp = pointer stored in the job's auth_key_xor_opad field
+	movdqu	xmm0, [tmp]
+	mov	DWORD(tmp),  [tmp + 4*4]
+	movd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0	; scatter the five words read through that pointer into column idx
+	pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+	pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+	pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+	mov	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+	jmp	copy_lane_data
+
+	align	16
+proc_extra_blocks:
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+	mov	[state + _lens + 2*idx], WORD(extra_blocks)	; lane length = number of extra blocks
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr + PTR_SZ*idx], tmp	; lane input = extra_block + start_offset
+	mov	dword [lane_data + _extra_blocks], 0	; so this path is taken only once per job
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax	; return value 0: no job returned
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]	; return value: the job stored in this lane
+	mov	qword [lane_data + _job_in_lane], 0	; mark the lane as holding no job
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov	unused_lanes, [state + _unused_lanes]	; rbx is reused: lane_data is no longer valid from here
+	shl	unused_lanes, 8	; make room in the low byte ...
+	or	unused_lanes, idx	; ... and put this lane index there
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]	; p = destination pointer for the tag
+
+	; copy 12 bytes
+	mov	DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp4)
+	mov	[p + 0*4], DWORD(tmp2)
+	mov	[p + 1*4], DWORD(tmp4)
+	mov	DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp2)
+	mov	[p + 2*4], DWORD(tmp2)
+
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 12
+        je      clear_ret	; a 12-byte tag was requested: done copying
+
+        ;; copy remaining 8 bytes to return 20 byte digest
+        mov	DWORD(tmp2),  [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+        mov	DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+        bswap	DWORD(tmp2)
+        bswap	DWORD(tmp4)
+        mov	[p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+        mov	[p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp4)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+
+        ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+        ;; of returned job and NULL jobs
+%assign I 0
+%rep 4
+	cmp	qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne	APPEND(skip_clear_,I)	; lane I still holds a job: keep its data
+
+        ;; Clear digest
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+        lea     lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 20 bytes of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+        mov     dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+	; restore the registers saved in the prologue, then the caller's rsp
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm
new file mode 100644
index 000000000..d23f37976
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm
@@ -0,0 +1,318 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern md5_x4x2_sse
+
+section .data
+default rel
+align 16
+dupw:	;ddq 0x01000100010001000100010001000100 ; pshufb control: copies source word 0 into all 8 words
+	dq 0x0100010001000100, 0x0100010001000100
+len_masks:	; one 16-byte mask per lane: OR-ing mask I into the lens vector sets lane I's 16-bit length to 0xFFFF
+	;ddq 0x0000000000000000000000000000FFFF
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	;ddq 0x00000000000000000000FFFF00000000
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	;ddq 0x0000000000000000FFFF000000000000
+	dq 0xFFFF000000000000, 0x0000000000000000
+	;ddq 0x000000000000FFFF0000000000000000
+	dq 0x0000000000000000, 0x000000000000FFFF
+	;ddq 0x00000000FFFF00000000000000000000
+	dq 0x0000000000000000, 0x00000000FFFF0000
+	;ddq 0x0000FFFF000000000000000000000000
+	dq 0x0000000000000000, 0x0000FFFF00000000
+	;ddq 0xFFFF0000000000000000000000000000
+	dq 0x0000000000000000, 0xFFFF000000000000
+one:	dq  1	; lane numbers 1..7: memory operands for the cmovne chain that picks a lane holding a job
+two:	dq  2
+three:	dq  3
+four:	dq  4
+five:	dq  5
+six:	dq  6
+seven:	dq  7
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbp (the code after "call md5_x4x2_sse" relies on state and idx still being intact)
+%define idx             rbp
+
+; unused_lanes must be in rax-rdx; NOTE: unused_lanes, lane_data and tmp2 all name rbx, so only one of them is live at a time
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2		rbx
+
+%define job_rax         rax
+%define	tmp1		rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3		arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4		r8
+%define tmp5		r9
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs; STACK is the aligned frame holding the saved GPRs and the caller's rsp
+struc STACK
+_gpr_save:	resq	8	; save slots: rbx, rbp, r12-r15, plus rsi/rdi on non-LINUX builds
+_rsp_save:	resq	1	; rsp value on entry, reloaded just before ret
+endstruc
+
+%define APPEND(a,b) a %+ b	; token-paste helper used to build the per-lane labels (skip_0, skip_clear_0, ...)
+
+; JOB* flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state)
+; arg 1 : state (rdi when LINUX is defined, rcx otherwise); result in rax: a completed job, or 0 via return_null
+MKGLOBAL(flush_job_hmac_md5_sse,function,internal)
+flush_job_hmac_md5_sse:
+
+        mov	rax, rsp		; remember the caller's rsp before realigning
+        sub	rsp, STACK_size
+        and	rsp, -16		; 16-byte align the STACK frame
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _unused_lanes_md5]
+	bt	unused_lanes, 32+3	; bit 35 set -> return NULL (presumably means every lane is free - TODO confirm encoding)
+	jc	return_null
+
+	; find a lane with a non-null job (idx starts at 0; every later match overwrites it, so the highest occupied lane wins)
+	xor	idx, idx
+	cmp	qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel one]
+	cmp	qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel two]
+	cmp	qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel three]
+	cmp	qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel four]
+	cmp	qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel five]
+	cmp	qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel six]
+	cmp	qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel seven]
+
+copy_lane_data:
+	; copy good lane (idx) data pointer to empty lanes and set their 16-bit lengths to 0xFFFF (the maximum word value)
+	movdqa	xmm0, [state + _lens_md5]
+	mov	tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx]
+
+%assign I 0
+%rep 8
+	cmp	qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*I], tmp
+	por	xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	movdqa	[state + _lens_md5], xmm0
+
+	phminposuw	xmm1, xmm0
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...7)
+	cmp	len2, 0
+	je	len_is_0
+
+	pshufb	xmm1, [rel dupw]	; duplicate words across all lanes
+	psubw	xmm0, xmm1		; subtract the minimum length from every lane
+	movdqa	[state + _lens_md5], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_x4x2_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx" (lane_data shares rbx with unused_lanes, which is reloaded from state where needed)
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_md5 + lane_data]
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop
+
+proc_outer:	; inner pass done: queue outer_block (filled with the current digest) as a length-1 input and reload the digest from job->_auth_key_xor_opad
+	mov	dword [lane_data + _outer_done], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	mov	qword [lane_data + _extra_block + size_offset], 0	; zero the 8 bytes at extra_block + size_offset
+	mov	word [state + _lens_md5 + 2*idx], 1
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
+	movd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+	pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+	pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+	pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+;	pshufb	xmm0, [byteswap wrt rip]
+	movdqa	[lane_data + _outer_block], xmm0
+
+	mov	tmp, [job + _auth_key_xor_opad]
+	movdqu	xmm0, [tmp]
+	movd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+	jmp	copy_lane_data
+
+	align	16
+proc_extra_blocks:	; queue the lane's extra_block data (from start_offset, extra_blocks long) as its next input
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+	mov	[state + _lens_md5 + 2*idx], WORD(extra_blocks)
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+	mov	dword [lane_data + _extra_blocks], 0	; extra blocks are now queued; do not queue them again
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:	; job in lane idx is finished: release the lane, flag the job and copy out the tag
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	qword [lane_data + _job_in_lane], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov	unused_lanes, [state + _unused_lanes_md5]
+	shl	unused_lanes, 4		; make room for one 4-bit lane number
+	or	unused_lanes, idx	; put idx back on the unused-lane list
+	mov	[state + _unused_lanes_md5], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+	; copy 12 bytes (digest words 0..2 of lane idx)
+	mov	DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+;	bswap	DWORD(tmp2)
+;	bswap	DWORD(tmp4)
+;	bswap	DWORD(tmp3)
+	mov	[p + 0*4], DWORD(tmp2)
+	mov	[p + 1*4], DWORD(tmp4)
+	mov	[p + 2*4], DWORD(tmp5)
+
+        cmp     DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+        je      clear_ret
+
+        ; copy 16 bytes: any tag length other than 12 also gets digest word 3
+        mov	DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+        mov	[p + 3*4], DWORD(tmp5)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+
+        ;; Clear digest (16B), outer_block (16B) and extra_block (64B)
+        ;; of returned job and NULL jobs (i.e. every lane whose _job_in_lane is 0 at this point)
+%assign I 0
+%rep 8
+	cmp	qword [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne	APPEND(skip_clear_,I)
+
+        ;; Clear digest (16 bytes)
+%assign J 0
+%rep 4
+        mov     dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*I + J*MD5_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+
+        lea     lane_data, [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size)]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 16 bytes of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm
new file mode 100644
index 000000000..acf78fd6d
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm
@@ -0,0 +1,356 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/memcpy.asm"
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+extern md5_x4x2_sse
+
+section .data
+default rel
+align 16
+;byteswap:	ddq 0x0c0d0e0f08090a0b0405060700010203
+dupw:	;ddq 0x01000100010001000100010001000100 ; pshufb control: copies source word 0 into all 8 words
+	dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define reg3	rcx
+%define reg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define reg3	rdi
+%define reg4	rsi
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbp (the code after "call md5_x4x2_sse" relies on idx being intact); last_len shares rbp and is only used before start_loop
+%define last_len        rbp
+%define idx             rbp
+
+%define p               r11
+%define start_offset    r11
+
+%define unused_lanes    rbx
+%define tmp4            rbx	; same register as unused_lanes: reload unused_lanes after using tmp4 as scratch
+
+%define job_rax         rax
+%define len             rax
+
+%define size_offset     reg3
+%define tmp2		reg3
+
+%define lane            reg4
+%define tmp3		reg4
+
+%define extra_blocks    r8
+
+%define tmp             r9
+%define p2              r9
+
+%define lane_data       r10
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs; STACK is the aligned frame holding the saved GPRs and the caller's rsp
+struc STACK
+_gpr_save:	resq	8	; save slots: rbx, rbp, r12-r15, plus rsi/rdi on non-LINUX builds
+_rsp_save:	resq	1	; rsp value on entry, reloaded just before ret
+endstruc
+
+; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (rdi when LINUX is defined, rcx otherwise)
+; arg 2 : job (rsi when LINUX is defined, rdx otherwise); result in rax: a completed job, or 0 via return_null
+MKGLOBAL(submit_job_hmac_md5_sse,function,internal)
+submit_job_hmac_md5_sse:
+
+        mov	rax, rsp		; remember the caller's rsp before realigning
+        sub	rsp, STACK_size
+        and	rsp, -16		; 16-byte align the STACK frame
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+        mov	unused_lanes, [state + _unused_lanes_md5]
+        mov	lane, unused_lanes
+        and	lane, 0xF		; lane = low nibble of the unused-lane list
+        shr	unused_lanes, 4		; drop that nibble from the list
+        imul	lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata_md5 + lane_data]
+        mov	[state + _unused_lanes_md5], unused_lanes
+        mov	len, [job + _msg_len_to_hash_in_bytes]
+        mov	tmp, len
+        shr	tmp, 6	; divide by 64, len in terms of blocks
+
+        mov	[lane_data + _job_in_lane], job
+        mov	dword [lane_data + _outer_done], 0
+
+        ;; insert len into proper lane (p is only a scratch register here; it is loaded with the source pointer further down)
+        movdqa  xmm0, [state + _lens_md5]
+        XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+        movdqa  [state + _lens_md5], xmm0
+
+        mov	last_len, len
+        and	last_len, 63		; bytes left over after the whole 64-byte blocks
+        lea	extra_blocks, [last_len + 9 + 63]	; (last_len + 9) rounded up to blocks; the 9 is presumably the pad byte + 8-byte length
+        shr	extra_blocks, 6
+        mov	[lane_data + _extra_blocks], DWORD(extra_blocks)
+
+        mov	p, [job + _src]
+        add	p, [job + _hash_start_src_offset_in_bytes]
+        mov	[state + _args_data_ptr_md5 + PTR_SZ*lane], p
+
+        cmp	len, 64
+        jb	copy_lt64
+
+fast_copy:	; len >= 64: copy the last 64 bytes of the message into extra_block
+        add	p, len
+        movdqu	xmm0, [p - 64 + 0*16]
+        movdqu	xmm1, [p - 64 + 1*16]
+        movdqu	xmm2, [p - 64 + 2*16]
+        movdqu	xmm3, [p - 64 + 3*16]
+        movdqa	[lane_data + _extra_block + 0*16], xmm0
+        movdqa	[lane_data + _extra_block + 1*16], xmm1
+        movdqa	[lane_data + _extra_block + 2*16], xmm2
+        movdqa	[lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+        mov	size_offset, extra_blocks
+        shl	size_offset, 6
+        sub	size_offset, last_len
+        add	size_offset, 64-8	; size_offset = 64*extra_blocks - last_len + 56: where the 8-byte length is stored below
+        mov	[lane_data + _size_offset], DWORD(size_offset)
+        mov	start_offset, 64
+        sub	start_offset, last_len	; start_offset = 64 - last_len: where the leftover bytes begin inside extra_block
+        mov	[lane_data + _start_offset], DWORD(start_offset)
+
+        lea	tmp, [8*64 + 8*len]	; bit count of (64 + len) bytes; the extra 64 is presumably the key^ipad block
+;	bswap	tmp
+        mov	[lane_data + _extra_block + size_offset], tmp
+
+        mov	tmp, [job + _auth_key_xor_ipad]
+        movdqu	xmm0, [tmp]		; 16 bytes from job->_auth_key_xor_ipad become the lane's starting digest
+        movd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
+        pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+        pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+        pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+
+        test	len, ~63		; nonzero when len >= 64, i.e. at least one whole block to hash first
+        jnz	ge64_bytes
+
+lt64_bytes:	; no whole block: point the lane straight at the extra_block data
+        movdqa  xmm0, [state + _lens_md5]
+        XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+        movdqa  [state + _lens_md5], xmm0
+
+        lea	tmp, [lane_data + _extra_block + start_offset]
+        mov	[state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
+        mov	dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+        cmp	unused_lanes, 0xf	; processing only starts when unused_lanes == 0xf (presumably: no free lane left - TODO confirm)
+        jne	return_null
+        jmp	start_loop
+
+        align	16
+start_loop:
+        ; Find min length
+        movdqa	xmm0, [state + _lens_md5]
+        phminposuw	xmm1, xmm0
+        pextrw	len2, xmm1, 0	; min value
+        pextrw	idx, xmm1, 1	; min index (0...7)
+        cmp	len2, 0
+        je	len_is_0
+
+        pshufb	xmm1, [rel dupw]	; duplicate words across all lanes
+        psubw	xmm0, xmm1		; subtract the minimum length from every lane
+        movdqa	[state + _lens_md5], xmm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call	md5_x4x2_sse
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata_md5 + lane_data]
+        mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+        cmp	extra_blocks, 0
+        jne	proc_extra_blocks
+        cmp	dword [lane_data + _outer_done], 0
+        jne	end_loop
+
+proc_outer:	; inner pass done: queue outer_block (filled with the current digest) as a length-1 input and reload the digest from job->_auth_key_xor_opad
+        mov	dword [lane_data + _outer_done], 1
+        mov	DWORD(size_offset), [lane_data + _size_offset]
+        mov	qword [lane_data + _extra_block + size_offset], 0	; zero the 8-byte length stored at submit time
+
+        movdqa  xmm0, [state + _lens_md5]
+        XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+        movdqa  [state + _lens_md5], xmm0
+
+        lea	tmp, [lane_data + _outer_block]
+        mov	job, [lane_data + _job_in_lane]
+        mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
+        movd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+        pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+        pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+        pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+;	pshufb	xmm0, [rel byteswap]
+        movdqa	[lane_data + _outer_block], xmm0
+
+        mov	tmp, [job + _auth_key_xor_opad]
+        movdqu	xmm0, [tmp]
+        movd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+        pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+        pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+        pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+        jmp	start_loop
+
+        align	16
+proc_extra_blocks:	; queue the lane's extra_block data (from start_offset, extra_blocks long) as its next input
+        mov	DWORD(start_offset), [lane_data + _start_offset]
+
+        movdqa  xmm0, [state + _lens_md5]
+        XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+        movdqa  [state + _lens_md5], xmm0
+
+        lea	tmp, [lane_data + _extra_block + start_offset]
+        mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+        mov	dword [lane_data + _extra_blocks], 0	; extra blocks are now queued; do not queue them again
+        jmp	start_loop
+
+        align	16
+
+copy_lt64:
+        ;; less than one message block of data
+        ;; beginning of source block
+        ;; destination extrablock but backwards by len from where 0x80 pre-populated
+        ;; tmp4 (passed to the memcpy macro) shares rbx with unused_lanes, so unused_lanes is reloaded before exiting
+        lea	p2, [lane_data + _extra_block  + 64]
+        sub     p2, len
+        memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+        mov	unused_lanes, [state + _unused_lanes_md5]
+        jmp	end_fast_copy
+
+return_null:
+        xor	job_rax, job_rax
+        jmp	return
+
+        align	16
+end_loop:	; job in lane idx is finished: release the lane, flag the job and copy out the tag
+        mov	job_rax, [lane_data + _job_in_lane]
+        mov	unused_lanes, [state + _unused_lanes_md5]
+        mov	qword [lane_data + _job_in_lane], 0
+        or	dword [job_rax + _status], STS_COMPLETED_HMAC
+        shl	unused_lanes, 4		; make room for one 4-bit lane number
+        or	unused_lanes, idx	; put idx back on the unused-lane list
+        mov	[state + _unused_lanes_md5], unused_lanes
+
+        mov	p, [job_rax + _auth_tag_output]
+
+        ; copy 12 bytes (digest words 0..2 of lane idx)
+        mov	DWORD(tmp),  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+        mov	DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+        mov	DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+        mov	[p + 0*4], DWORD(tmp)
+        mov	[p + 1*4], DWORD(tmp2)
+        mov	[p + 2*4], DWORD(tmp3)
+
+        cmp     DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+        je      clear_ret
+
+        ; copy 16 bytes: any tag length other than 12 also gets digest word 3
+        mov	DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+        mov	[p + 3*4], DWORD(tmp3)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+        ;; Clear digest (16B), outer_block (16B) and extra_block (64B) of returned job
+        mov     dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 0
+
+        pxor    xmm0, xmm0
+        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea     lane_data, [state + _ldata_md5 + lane_data]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 16 bytes of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+%endif
+
+return:
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+        ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm
new file mode 100644
index 000000000..23fcd74d7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224	; set before the include below; presumably switches the SHA-256 NI flush source to its SHA-224 variant - confirm in that file
+%include "sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm
new file mode 100644
index 000000000..e1f11a44f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_224_sse	; presumably the entry-point name the included file emits - confirm there
+%define SHA224	; presumably switches the included SHA-256 flush source to its SHA-224 variant - confirm there
+
+%include "sse/mb_mgr_hmac_sha_256_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm
new file mode 100644
index 000000000..12c0350af
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224	; set before the include below; presumably switches the SHA-256 NI submit source to its SHA-224 variant - confirm in that file
+%include "sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm
new file mode 100644
index 000000000..111f5092c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_224_sse	; presumably the entry-point name the included file emits - confirm there
+%define SHA224	; presumably switches the included SHA-256 submit source to its SHA-224 variant - confirm there
+
+%include "sse/mb_mgr_hmac_sha_256_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm
new file mode 100644
index 000000000..9a2f20ffc
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm
@@ -0,0 +1,333 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;;	callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;;	callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha256_ni
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define idx		rbp
+
+%define unused_lanes	rbx
+%define lane_data	rbx
+%define tmp2		rbx
+
+%define job_rax		rax
+%define	tmp1		rax
+%define size_offset	rax
+%define tmp		rax
+%define start_offset	rax
+
+%define tmp3		arg1
+
+%define extra_blocks	arg2
+%define p		arg2
+
+%define tmp4		r8
+
+%define tmp5	        r9
+
+%define tmp6	        r10
+
+%define bswap_xmm4	xmm4
+
+struc STACK
+_gpr_save:	resq	4 ;rbx, rbp, rsi (win), rdi (win)
+_rsp_save:	resq	1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+section .data
+default rel
+
+align 16
+byteswap:
+	dq 0x0405060700010203
+	dq 0x0c0d0e0f08090a0b
+
+one:	dq  1
+
+section .text
+
+%ifdef SHA224
+;; JOB* flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state)
+;; arg1 : state; result in rax: completed JOB* or NULL (see return_null)
+MKGLOBAL(flush_job_hmac_sha_224_ni_sse,function,internal)
+flush_job_hmac_sha_224_ni_sse:
+%else
+;; JOB* flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state)
+;; arg1 : state; result in rax: completed JOB* or NULL (see return_null)
+MKGLOBAL(flush_job_hmac_sha_256_ni_sse,function,internal)
+flush_job_hmac_sha_256_ni_sse:
+%endif
+	mov	rax, rsp		; remember the caller's SP; restored from _rsp_save at "return"
+	sub	rsp, STACK_size
+	and	rsp, -16		; round the save area down to a 16-byte boundary
+
+	mov	[rsp + _gpr_save + 8*0], rbx	; rbx backs unused_lanes/lane_data/tmp2
+	mov	[rsp + _gpr_save + 8*1], rbp	; rbp backs idx
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*2], rsi
+	mov	[rsp + _gpr_save + 8*3], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+        DBGPRINTL "enter sha256-ni-sse flush"
+
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	bt	unused_lanes, 16+7	; top bit of byte 2 set -> nothing to flush (presumably the 0xFF end marker, i.e. both lanes free - confirm against manager init)
+	jc	return_null
+
+	; find a lane with a non-null job, assume it is 0 then check 1
+	xor	idx, idx
+	cmp	qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel one]	; lane 1 holds a job -> idx = 1, otherwise idx stays 0
+	DBGPRINTL64 "idx:", idx
+
+copy_lane_data:
+	; copy idx to empty lanes: mirror lane idx's data pointer into the other lane, then hash
+	mov	tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx]	; tmp = data pointer of lane idx
+	xor	len2, len2
+	mov	WORD(len2), word [state + _lens_sha256 + idx*2]	; len2 = 16-bit length of lane idx, upper bits cleared above
+
+	; there are only two lanes so if one is empty it is easy to determine which one
+	xor	idx, 1		; idx now names the other lane (assumed empty here - NOTE(review): confirm both lanes never hold jobs at flush time)
+	mov	[state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp	; presumably so the idle lane reads valid memory inside sha256_ni
+	xor	idx, 1		; back to the lane being flushed
+
+	; No need to find min length - only two lanes available
+        cmp	len2, 0
+        je	len_is_0	; nothing left to hash for this lane, skip the call
+
+	; set length on both lanes to 0
+	mov	dword [state + _lens_sha256], 0	; a single dword store covers both 16-bit lane lengths
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha256_ni
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx": pick the next stage (extra blocks, outer hash, or done)
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_sha256 + lane_data]	; lane_data = address of lane idx's bookkeeping record
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]	; 32-bit load zero-extends into the full register
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks	; buffered tail blocks still have to be hashed
+	movdqa	bswap_xmm4, [rel byteswap]	; reloaded on every pass; used by pshufb in proc_outer and end_loop
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop	; outer hash already scheduled -> job is finished; otherwise fall into proc_outer
+
+proc_outer:
+	mov	dword [lane_data + _outer_done], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	mov	qword [lane_data + _extra_block + size_offset], 0
+	mov	word [state + _lens_sha256 + 2*idx], 1
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+	lea	tmp4, [idx*8]	 ; x8 here + scale factor x4 below give x32
+	movdqu	xmm0, [state + _args_digest_sha256 + tmp4*4]
+	movdqu	xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4]
+	pshufb	xmm0, bswap_xmm4
+	pshufb	xmm1, bswap_xmm4
+	movdqa	[lane_data + _outer_block], xmm0
+	movdqa	[lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+	;; overwrite top 4 bytes with 0x80
+	mov	dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+        DBGPRINTL	"sha256 outer hash input words:"
+        DBGPRINT_XMM xmm0
+        DBGPRINT_XMM xmm1
+
+	mov	tmp, [job + _auth_key_xor_opad]
+	movdqu	xmm0, [tmp]
+	movdqu	xmm1, [tmp + 4*4]
+	DBGPRINTL64 "auth_key_xor_opad", tmp
+	movdqu	[state + _args_digest_sha256 + tmp4*4], xmm0
+	movdqu	[state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1
+        DBGPRINTL	"new digest args"
+        DBGPRINT_XMM xmm0
+        DBGPRINT_XMM xmm1
+	jmp	copy_lane_data
+
+	align	16
+proc_extra_blocks:	; queue the lane's buffered extra blocks as its next input, then hash again
+	mov	DWORD(start_offset), [lane_data + _start_offset]	; start_offset and tmp share rax
+	mov	[state + _lens_sha256 + 2*idx], WORD(extra_blocks)	; lane length = number of extra blocks
+	lea	tmp, [lane_data + _extra_block + start_offset]	; first byte to hash inside _extra_block
+	mov	[state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+	mov	dword [lane_data + _extra_blocks], 0	; mark the extra blocks as consumed so this path runs once
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	qword [lane_data + _job_in_lane], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	shl	unused_lanes, 8
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes_sha256], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+	; copy 16 bytes for SHA256, 14 bytes for SHA224
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+	shl	idx, 5
+
+%ifdef SHA224
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 14
+        jne     copy_full_digest
+%else
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 16
+        jne     copy_full_digest
+%endif
+	movdqu	xmm0, [state + _args_digest_sha256 + idx]
+	pshufb	xmm0, bswap_xmm4
+%ifdef SHA224
+	;; SHA224
+	movq	[p + 0*4], xmm0
+	pextrd	[p + 2*4], xmm0, 2
+	pextrw	[p + 3*4], xmm0, 6
+%else
+	;; SHA256
+	movdqu	[p], xmm0
+%endif
+	DBGPRINTL	"auth_tag_output:"
+        DBGPRINT_XMM	xmm0
+        jmp     clear_ret
+
+copy_full_digest:
+	movdqu	xmm0,  [state + _args_digest_sha256 + idx]
+	movdqu	xmm1,  [state + _args_digest_sha256 + idx + 16]
+	pshufb	xmm0, bswap_xmm4
+	pshufb	xmm1, bswap_xmm4
+%ifdef SHA224
+	;; SHA224
+	movdqu	[p], xmm0
+	movq	[p + 16], xmm1
+	pextrd	[p + 16 + 8], xmm1, 2
+%else
+	;; SHA256
+	movdqu	[p], xmm0
+	movdqu	[p + 16], xmm1
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+
+        ;; Clear digest, outer_block (28B/32B) and extra_block (64B)
+        ;; of returned job and NULL jobs
+%assign I 0
+%rep 2
+	cmp	qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne	APPEND(skip_clear_,I)
+
+        ;; Clear digest
+        movdqa  [state + _args_digest_sha256 + I*32], xmm0
+        movdqa  [state + _args_digest_sha256 + I*32 + 16], xmm0
+
+        lea     lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+%ifdef SHA224
+        mov     qword [lane_data + _outer_block + 16], 0
+        mov     dword [lane_data + _outer_block + 24], 0
+%else
+        movdqa  [lane_data + _outer_block + 16], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+        DBGPRINTL "exit sha256-ni-sse flush"
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*2]
+	mov	rdi, [rsp + _gpr_save + 8*3]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm
new file mode 100644
index 000000000..5ab064b89
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm
@@ -0,0 +1,356 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha_256_mult_sse
+
+section .data
+default rel
+
+align 16
+byteswap:	;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+len_masks:
+	;ddq 0x0000000000000000000000000000FFFF
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	;ddq 0x00000000000000000000FFFF00000000
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	;ddq 0x0000000000000000FFFF000000000000
+	dq 0xFFFF000000000000, 0x0000000000000000
+one:	dq  1
+two:	dq  2
+three:	dq  3
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_256_sse
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define idx		rbp
+
+%define unused_lanes	rbx
+%define lane_data	rbx
+%define tmp2		rbx
+
+%define job_rax		rax
+%define	tmp1		rax
+%define size_offset	rax
+%define tmp		rax
+%define start_offset	rax
+
+%define tmp3		arg1
+
+%define extra_blocks	arg2
+%define p		arg2
+
+%define tmp4		r8
+
+%define tmp5	        r9
+
+%define tmp6	        r10
+
+%endif
+
+; This routine clobbers rbx, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save:	resq	3
+_rsp_save:	resq	1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : state (arg1 = rdi when LINUX is defined, rcx otherwise); result in rax: JOB* or NULL
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+	mov	rax, rsp		; remember the caller's SP; restored from _rsp_save at "return"
+	sub	rsp, STACK_size
+	and	rsp, -16		; round the save area down to a 16-byte boundary
+
+	mov	[rsp + _gpr_save + 8*0], rbx	; rbx backs unused_lanes/lane_data/tmp2
+	mov	[rsp + _gpr_save + 8*1], rbp	; rbp backs idx
+	mov	[rsp + _gpr_save + 8*2], r12	; saved because the called routine clobbers it (see STACK note)
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	bt	unused_lanes, 32+7	; top bit of byte 4 set -> nothing to flush (presumably the 0xFF end marker, i.e. all four lanes free - confirm against manager init)
+	jc	return_null
+
+	; find a lane with a non-null job: start at 0, the highest occupied lane among 1..3 wins
+	xor	idx, idx
+	cmp	qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel one]
+	cmp	qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel two]
+	cmp	qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel three]
+
+copy_lane_data:
+	; copy idx to empty lanes: give every job-less lane lane idx's data pointer and length 0xFFFF
+	movdqa	xmm0, [state + _lens_sha256]	; xmm0 = packed 16-bit lane lengths
+	mov	tmp, [state + _args_data_ptr_sha256 + 8*idx]	; tmp = data pointer of lane idx
+
+%assign I 0
+%rep 4
+	cmp	qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args_data_ptr_sha256 + 8*I], tmp	; empty lane I reuses the pointer (presumably so it reads valid memory)
+	por	xmm0, [rel len_masks + 16*I]	; force lane I's length word to 0xFFFF so it is not chosen as the minimum
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	movdqa	[state + _lens_sha256], xmm0
+
+	phminposuw	xmm1, xmm0	; unsigned minimum over the 16-bit words of xmm0 and its position
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0	; the shortest lane has nothing left to hash, skip the call
+
+	pshuflw	xmm1, xmm1, 0	; replicate the minimum into the four low words
+	psubw	xmm0, xmm1	; subtract the blocks about to be hashed from every lane length
+	movdqa	[state + _lens_sha256], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha_256_mult_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_sha256 + lane_data]
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop
+
+proc_outer:
+	mov	dword [lane_data + _outer_done], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	mov	qword [lane_data + _extra_block + size_offset], 0
+	mov	word [state + _lens_sha256 + 2*idx], 1
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr_sha256 + 8*idx], tmp
+
+	movd	xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	pinsrd	xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+	pinsrd	xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+	pinsrd	xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+	pshufb	xmm0, [rel byteswap]
+	movd	xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	pinsrd	xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+	pinsrd	xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+	pinsrd	xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+	pshufb	xmm1, [rel byteswap]
+	movdqa	[lane_data + _outer_block], xmm0
+	movdqa	[lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+	mov		dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+	mov	tmp, [job + _auth_key_xor_opad]
+	movdqu	xmm0, [tmp]
+	movdqu	xmm1, [tmp + 4*4]
+	movd	[state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+	pextrd	[state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+	pextrd	[state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+	pextrd	[state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+	movd	[state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+	pextrd	[state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+	pextrd	[state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+	pextrd	[state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+	jmp	copy_lane_data
+
+	align	16
+proc_extra_blocks:
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+	mov	[state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_sha256 + 8*idx], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	qword [lane_data + _job_in_lane], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	shl	unused_lanes, 8
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes_sha256], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+%ifdef SHA224
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 14
+        jne     copy_full_digest
+%else
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 16
+        jne     copy_full_digest
+%endif
+	;; copy 14 bytes for SHA224 / 16 bytes for SHA256
+	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp4)
+	bswap	DWORD(tmp6)
+	bswap	DWORD(tmp5)
+	mov	[p + 0*4], DWORD(tmp2)
+	mov	[p + 1*4], DWORD(tmp4)
+	mov	[p + 2*4], DWORD(tmp6)
+%ifdef SHA224
+	mov	[p + 3*4], WORD(tmp5)
+%else
+	mov	[p + 3*4], DWORD(tmp5)
+%endif
+        jmp     clear_ret
+
+copy_full_digest:
+	;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp4)
+	bswap	DWORD(tmp6)
+	bswap	DWORD(tmp5)
+	mov	[p + 0*4], DWORD(tmp2)
+	mov	[p + 1*4], DWORD(tmp4)
+	mov	[p + 2*4], DWORD(tmp6)
+	mov	[p + 3*4], DWORD(tmp5)
+
+	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+	mov	DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp4)
+	bswap	DWORD(tmp6)
+%ifndef SHA224
+	bswap	DWORD(tmp5)
+%endif
+	mov	[p + 4*4], DWORD(tmp2)
+	mov	[p + 5*4], DWORD(tmp4)
+	mov	[p + 6*4], DWORD(tmp6)
+%ifndef SHA224
+	mov	[p + 7*4], DWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+
+        ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
+        ;; of returned job and NULL jobs
+%assign I 0
+%rep 4
+	cmp	qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne	APPEND(skip_clear_,I)
+
+        ;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256 bytes)
+%assign J 0
+%rep 7
+        mov     dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+        mov     dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+        lea     lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+%ifdef SHA224
+        mov     qword [lane_data + _outer_block + 16], 0
+        mov     dword [lane_data + _outer_block + 24], 0
+%else
+        movdqa  [lane_data + _outer_block + 16], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	rsp, [rsp + _rsp_save]	; original SP
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm
new file mode 100644
index 000000000..d4ded1f6d
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm
@@ -0,0 +1,401 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;;	callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;;	callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha256_ni
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define reg3	rcx
+%define reg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define reg3	rdi
+%define reg4	rsi
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define last_len	rbp
+%define idx		rbp
+
+%define p		r11
+%define start_offset	r11
+
+%define unused_lanes	rbx
+%define tmp4		rbx
+
+%define job_rax		rax
+%define len		rax
+
+%define size_offset	reg3
+%define tmp2		reg3
+
+%define lane		reg4
+
+%define extra_blocks	r8
+
+%define tmp		r9
+%define p2		r9
+
+%define lane_data	r10
+
+%define bswap_xmm4	xmm4
+
+struc STACK
+_gpr_save:	resq	4	; rbx, rbp, rsi (win), rdi (win)
+_rsp_save:	resq	1
+endstruc
+
+section .data
+default rel
+
+align 16
+byteswap:
+	dq 0x0405060700010203
+	dq 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifdef SHA224
+; JOB* submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job; result in rax: a completed JOB* or NULL (see return_null)
+MKGLOBAL(submit_job_hmac_sha_224_ni_sse,function,internal)
+submit_job_hmac_sha_224_ni_sse:
+
+%else
+
+; JOB* submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job; result in rax: a completed JOB* or NULL (see return_null)
+MKGLOBAL(submit_job_hmac_sha_256_ni_sse,function,internal)
+submit_job_hmac_sha_256_ni_sse:
+%endif
+
+	mov	rax, rsp		; remember the caller's SP; restored from _rsp_save at "return"
+	sub	rsp, STACK_size
+	and	rsp, -16		; round the save area down to a 16-byte boundary
+
+	mov	[rsp + _gpr_save + 8*0], rbx	; rbx backs unused_lanes/tmp4
+	mov	[rsp + _gpr_save + 8*1], rbp	; rbp backs last_len/idx
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*2], rsi	; rsi/rdi back reg4/reg3 on non-Linux builds
+	mov	[rsp + _gpr_save + 8*3], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+        DBGPRINTL "enter sha256-ni-sse submit"
+
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	movzx	lane, BYTE(unused_lanes)	; low byte = lane index handed to this job
+	DBGPRINTL64 "lane: ", lane
+	shr	unused_lanes, 8		; drop that byte from the free-lane list
+	imul	lane_data, lane, _HMAC_SHA1_LANE_DATA_size ; SHA1 & SHA256 lane data is the same
+	lea	lane_data, [state + _ldata_sha256 + lane_data]	; lane_data = address of this lane's bookkeeping record
+	mov	[state + _unused_lanes_sha256], unused_lanes
+	mov	len, [job + _msg_len_to_hash_in_bytes]
+	DBGPRINTL64 "length: ", len
+	mov	tmp, len
+	shr	tmp, 6	; divide by 64, len in terms of blocks
+
+	mov	[lane_data + _job_in_lane], job
+	mov	dword [lane_data + _outer_done], 0	; outer hash not scheduled yet for this job
+	mov	[state + _lens_sha256 + 2*lane], WORD(tmp)	; only the low 16 bits of the block count are stored
+
+	mov	last_len, len
+	and	last_len, 63		; last_len = bytes beyond the last whole 64-byte block
+	lea	extra_blocks, [last_len + 9 + 63]
+	shr	extra_blocks, 6		; = ceil((last_len + 9) / 64); the 9 is presumably the 0x80 pad byte + 8-byte length
+	mov	[lane_data + _extra_blocks], DWORD(extra_blocks)
+
+	mov	p, [job + _src]
+	add	p, [job + _hash_start_src_offset_in_bytes]	; p = first byte to hash
+	mov	[state + _args_data_ptr_sha256 + 8*lane], p
+
+	cmp	len, 64
+	jb	copy_lt64		; under one block: byte-wise copy path instead of the 64-byte fast copy
+
+fast_copy:
+	add	p, len
+	movdqu	xmm0, [p - 64 + 0*16]
+	movdqu	xmm1, [p - 64 + 1*16]
+	movdqu	xmm2, [p - 64 + 2*16]
+	movdqu	xmm3, [p - 64 + 3*16]
+	movdqa	[lane_data + _extra_block + 0*16], xmm0
+	movdqa	[lane_data + _extra_block + 1*16], xmm1
+	movdqa	[lane_data + _extra_block + 2*16], xmm2
+	movdqa	[lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+	; record where the tail starts and where the length field goes, then seed the lane digest
+	mov	size_offset, extra_blocks
+	shl	size_offset, 6
+	sub	size_offset, last_len
+	add	size_offset, 64-8	; size_offset = (64 - last_len) + 64*extra_blocks - 8: last 8 bytes of the padded tail
+	mov	[lane_data + _size_offset], DWORD(size_offset)
+	mov	start_offset, 64
+	sub	start_offset, last_len	; tail bytes occupy _extra_block[64 - last_len .. 63]
+	mov	[lane_data + _start_offset], DWORD(start_offset)
+
+	lea	tmp, [8*64 + 8*len]	; bit count of len bytes plus one 64-byte block (presumably the ipad key block)
+	bswap	tmp			; stored byte-reversed (big-endian in memory)
+	mov	[lane_data + _extra_block + size_offset], tmp
+
+	mov	tmp, [job + _auth_key_xor_ipad]
+	movdqu	xmm0, [tmp]		; 32 bytes read from the job's _auth_key_xor_ipad buffer
+	movdqu	xmm1, [tmp + 4*4]
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+	lea	tmp, [lane*8]	; x8 here plus x4 scale factor give x32
+	movdqu	[state + _args_digest_sha256 + tmp*4], xmm0	; becomes the starting digest of this lane
+	movdqu	[state + _args_digest_sha256 + tmp*4 + 4*4], xmm1
+	DBGPRINTL	"args digest:"
+        DBGPRINT_XMM	xmm0
+        DBGPRINT_XMM	xmm1
+	test	len, ~63		; any bit above the low 6 set <=> len >= 64
+	jnz	ge64_bytes
+
+lt64_bytes:
+	mov	[state + _lens_sha256 + 2*lane], WORD(extra_blocks)
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_sha256 + 8*lane], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+	cmp	unused_lanes, 0xff
+	jne	return_null
+	jmp	start_loop
+
+	align	16
+start_loop:
+	; Find min length - only two lanes available; lengths are unsigned 16-bit block counts
+	xor     len2, len2
+	mov	tmp, 0x10000
+	mov	WORD(len2), word [state + _lens_sha256 + 0*2]	; [0:15] - lane 0 length, [16:31] - lane index (0)
+	mov	WORD(tmp), word [state + _lens_sha256 + 1*2]	; [0:15] - lane 1 length, [16:31] - lane index (1)
+	cmp     WORD(len2), WORD(tmp)
+	cmova	DWORD(len2), DWORD(tmp)	; unsigned compare: take lane 1 only if lane 0 length is above lane 1 length (ties keep lane 0)
+
+	mov	idx, len2		; retrieve index & length from [16:31] and [0:15] bit fields
+	shr	DWORD(idx), 16		; idx = index of the lane with the minimum length
+	and	DWORD(len2), 0xffff	; len2 = minimum length; ZF set when it is 0
+	je	len_is_0		; that lane has nothing left to hash, skip the call
+
+	sub	word [state + _lens_sha256 + 0*2], WORD(len2)	; both lanes advance by len2 blocks,
+	sub	word [state + _lens_sha256 + 1*2], WORD(len2)	; len2 <= either length so neither wraps
+
+	; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call	sha256_ni
+        ; state is intact
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_sha256 + lane_data]
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks
+	movdqa	bswap_xmm4, [rel byteswap]
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop
+
+proc_outer:
+	mov	dword [lane_data + _outer_done], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	mov	qword [lane_data + _extra_block + size_offset], 0
+	mov	word [state + _lens_sha256 + 2*idx], 1
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+	lea	tmp4, [idx*8]	; x8 here + scale factor x4 below give x32
+	movdqu	xmm0, [state + _args_digest_sha256 + tmp4*4]
+	movdqu	xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4]
+	pshufb	xmm0, bswap_xmm4
+	pshufb	xmm1, bswap_xmm4
+	movdqa	[lane_data + _outer_block], xmm0
+	movdqa	[lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+	;; overwrite top 4 bytes with 0x80
+	mov	dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+	mov	tmp, [job + _auth_key_xor_opad]
+	movdqu	xmm0, [tmp]
+	movdqu	xmm1,  [tmp + 4*4]
+	movdqu	[state + _args_digest_sha256 + tmp4*4], xmm0
+	movdqu	[state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1
+	jmp	start_loop
+
+	align	16
+proc_extra_blocks:
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+	mov	[state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+	jmp	start_loop
+
+	align	16
+
+copy_lt64:
+	;; message shorter than one 64-byte block: copy all "len" bytes from p (start of the data)
+	;; so that they end at _extra_block + 64, i.e. destination p2 = _extra_block + 64 - len
+	;; (backwards by len from where 0x80 is pre-populated - set up outside this routine)
+	;; tmp4 shares rbx with unused_lanes and is passed to the copy macro, so unused_lanes is reloaded below
+	lea	p2, [lane_data + _extra_block  + 64]
+	sub	p2, len
+	memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+	mov	unused_lanes, [state + _unused_lanes_sha256]	; restore the value that ge64_bytes compares against 0xff
+	jmp	end_fast_copy
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	mov	qword [lane_data + _job_in_lane], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	shl	unused_lanes, 8
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes_sha256], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+	; copy 16 bytes for SHA256, 14 for SHA224
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+	shl	idx, 5
+
+%ifdef SHA224
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 14
+        jne     copy_full_digest
+%else
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 16
+        jne     copy_full_digest
+%endif
+
+	movdqu	xmm0,  [state + _args_digest_sha256 + idx]
+	pshufb	xmm0, bswap_xmm4
+%ifdef SHA224
+	;; SHA224
+	movq	[p + 0*4], xmm0
+	pextrd	[p + 2*4], xmm0, 2
+	pextrw	[p + 3*4], xmm0, 6
+%else
+	;; SHA256
+	movdqu	[p], xmm0
+%endif
+        jmp     clear_ret
+
+copy_full_digest:
+	movdqu	xmm0,  [state + _args_digest_sha256 + idx]
+	movdqu	xmm1,  [state + _args_digest_sha256 + idx + 16]
+	pshufb	xmm0, bswap_xmm4
+	pshufb	xmm1, bswap_xmm4
+%ifdef SHA224
+	;; SHA224
+	movdqu	[p], xmm0
+	movq	[p + 16], xmm1
+	pextrd	[p + 16 + 8], xmm1, 2
+%else
+	;; SHA256
+	movdqu	[p], xmm0
+	movdqu	[p + 16], xmm1
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+        ;; Clear digest, outer_block (28B/32B) and extra_block (64B) of returned job
+        movdqa  [state + _args_digest_sha256 + idx], xmm0
+        movdqa  [state + _args_digest_sha256 + idx + 16], xmm0
+
+        shr     idx, 5 ;; Restore lane idx to 0 or 1
+        imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata_sha256 + lane_data]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+%ifdef SHA224
+        mov     qword [lane_data + _outer_block + 16], 0
+        mov     dword [lane_data + _outer_block + 24], 0
+%else
+        movdqa  [lane_data + _outer_block + 16], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*2]
+	mov	rdi, [rsp + _gpr_save + 8*3]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm
new file mode 100644
index 000000000..8025b2f96
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm
@@ -0,0 +1,427 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha_256_mult_sse	; hashing core called from start_loop (arg1 = state/args, arg2 = min lane length in blocks)
+
+section .data
+default rel
+align 16
+byteswap:	; pshufb mask reversing the bytes inside each 32-bit dword (ddq 0x0c0d0e0f08090a0b0405060700010203)
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+; FUNC may already be defined when this point is reached; then the body below is emitted under that name.
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_256_sse
+%endif
+; Register map: ABI-dependent argument/scratch registers first, then the symbolic names used by the routine.
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define reg3	rcx
+%define reg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define reg3	rdi
+%define reg4	rsi
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+; NOTE: names sharing a register (job/len2, last_len/idx, p/start_offset, unused_lanes/tmp4, ...) must not be live together.
+; idx needs to be in rbx, rbp, r13-r15 (it is read after the call to sha_256_mult_sse, which also clobbers r12)
+%define last_len	rbp
+%define idx		rbp
+
+%define p		r11
+%define start_offset	r11
+
+%define unused_lanes	rbx
+%define tmp4		rbx
+
+%define job_rax		rax
+%define len		rax
+
+%define size_offset	reg3
+%define tmp2		reg3
+
+%define lane		reg4
+%define tmp3		reg4
+
+%define extra_blocks	r8
+
+%define tmp		r9
+%define p2		r9
+
+%define lane_data	r10
+
+%endif
+; Stack frame: five GPR save slots (rbx, rbp, r12 and, on non-LINUX builds, rsi and rdi) plus the caller's rsp.
+; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12
+struc STACK
+_gpr_save:	resq	5
+_rsp_save:	resq	1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rdi (LINUX) / rcx (otherwise) : state
+; arg 2 : rsi (LINUX) / rdx (otherwise) : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+	; Returns in rax a job flagged STS_COMPLETED_HMAC, or NULL via return_null.
+	mov	rax, rsp	; remember the caller's rsp; the frame below is realigned
+	sub	rsp, STACK_size
+	and	rsp, -16	; 16-byte align the local frame
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*3], rsi
+	mov	[rsp + _gpr_save + 8*4], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+	; Take the lane index from the low byte of unused_lanes and shift the byte-packed list down by one entry.
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	movzx	lane, BYTE(unused_lanes)
+	shr	unused_lanes, 8
+	imul	lane_data, lane, _HMAC_SHA1_LANE_DATA_size	; byte offset of this lane's record (SHA1 lane-data size is used here)
+	lea	lane_data, [state + _ldata_sha256 + lane_data]	; lane_data = address of the lane's record
+	mov	[state + _unused_lanes_sha256], unused_lanes
+	mov	len, [job + _msg_len_to_hash_in_bytes]
+	mov	tmp, len
+	shr	tmp, 6	; divide by 64, len in terms of blocks
+
+	mov	[lane_data + _job_in_lane], job
+	mov	dword [lane_data + _outer_done], 0	; outer hash not started yet
+	; Presumably XPINSRW writes tmp (whole-block count) into the 16-bit lens entry of this lane, using p as scratch - confirm in const.inc.
+        movdqa  xmm0, [state + _lens_sha256]
+        XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+        movdqa  [state + _lens_sha256], xmm0
+
+	mov	last_len, len
+	and	last_len, 63	; last_len = len mod 64 (bytes in the trailing partial block)
+	lea	extra_blocks, [last_len + 9 + 63]	; +9 = room for 9 more bytes (8 of them the length stored later), +63 to round up
+	shr	extra_blocks, 6	; extra_blocks = ceil((last_len + 9) / 64)
+	mov	[lane_data + _extra_blocks], DWORD(extra_blocks)
+
+	mov	p, [job + _src]
+	add	p, [job + _hash_start_src_offset_in_bytes]	; p = first byte to hash
+	mov	[state + _args_data_ptr_sha256 + 8*lane], p
+
+	cmp	len, 64
+	jb	copy_lt64	; fewer than 64 bytes: cannot read a full 64-byte tail from the source
+
+fast_copy:
+	add		p, len	; p = one past the last message byte (len >= 64 on this path)
+	movdqu	xmm0, [p - 64 + 0*16]	; load the final 64 message bytes (source may be unaligned)
+	movdqu	xmm1, [p - 64 + 1*16]
+	movdqu	xmm2, [p - 64 + 2*16]
+	movdqu	xmm3, [p - 64 + 3*16]
+	movdqa	[lane_data + _extra_block + 0*16], xmm0	; store them in extra_block[0..63] so the message ends at offset 64
+	movdqa	[lane_data + _extra_block + 1*16], xmm1
+	movdqa	[lane_data + _extra_block + 2*16], xmm2
+	movdqa	[lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+	; size_offset = 64*extra_blocks - last_len + 56 = start_offset + 64*extra_blocks - 8 (last 8 bytes of the padded tail)
+	mov	size_offset, extra_blocks
+	shl	size_offset, 6
+	sub	size_offset, last_len
+	add	size_offset, 64-8
+	mov	[lane_data + _size_offset], DWORD(size_offset)
+	mov	start_offset, 64	; start_offset = 64 - last_len: where the partial tail begins inside extra_block
+	sub	start_offset, last_len
+	mov	[lane_data + _start_offset], DWORD(start_offset)
+
+	lea	tmp, [8*64 + 8*len]	; bit count of (64 extra bytes + message); the 64 is presumably the ipad key block
+	bswap	tmp	; stored big-endian
+	mov	[lane_data + _extra_block + size_offset], tmp
+	; Copy the 8 dwords at job->auth_key_xor_ipad into this lane's column of the digest rows.
+	mov	tmp, [job + _auth_key_xor_ipad]
+	movdqu	xmm0, [tmp]
+	movdqu	xmm1, [tmp + 4*4]
+	movd	[state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+	pextrd	[state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+	pextrd	[state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+	pextrd	[state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+	movd	[state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+	pextrd	[state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+	pextrd	[state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+	pextrd	[state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+	test	len, ~63	; any bit at or above bit 6 set means len >= 64
+	jnz	ge64_bytes
+
+lt64_bytes:
+        movdqa  xmm0, [state + _lens_sha256]
+        XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+        movdqa  [state + _lens_sha256], xmm0
+	; No whole source block exists: hash straight from extra_block and mark the extra blocks as already scheduled.
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_sha256 + 8*lane], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+	cmp	unused_lanes, 0xff	; remaining free-lane list; presumably 0xff alone means no free lane is left
+	jne	return_null	; otherwise return NULL without hashing
+	jmp	start_loop
+
+	align	16
+start_loop:
+	; Find min length
+	movdqa	xmm0, [state + _lens_sha256]
+	phminposuw	xmm1, xmm0	; xmm1 word 0 = smallest 16-bit length, word 1 = its index
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0	; a lane already has nothing left to hash: skip the core call
+
+	pshuflw	xmm1, xmm1, 0	; broadcast the minimum to the four low words
+	psubw	xmm0, xmm1	; subtract it from the four low length words
+	movdqa	[state + _lens_sha256], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha_256_mult_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_sha256 + lane_data]	; lane_data = record of lane idx
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks	; padded tail blocks still have to be hashed
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop	; outer hash already done: the job is complete
+
+proc_outer:
+	mov	dword [lane_data + _outer_done], 1	; set before scheduling the outer block so the next completion ends the job
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	mov	qword [lane_data + _extra_block + size_offset], 0	; wipe the 8-byte length field written at submit time
+	; Presumably XPINSRW sets the 16-bit lens entry of lane idx to 1 (a single outer block) - confirm in const.inc.
+        movdqa  xmm0, [state + _lens_sha256]
+        XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+        movdqa  [state + _lens_sha256], xmm0
+
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]	; reload job: its register (arg2) was reused as len2
+	mov	[state + _args_data_ptr_sha256 + 8*idx], tmp	; next input for this lane is outer_block
+	; Gather this lane's digest words from the per-word rows, byte-swap each dword and store them in outer_block.
+	movd	xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	pinsrd	xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+	pinsrd	xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+	pinsrd	xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+	pshufb	xmm0, [rel byteswap]
+	movd	xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	pinsrd	xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+	pinsrd	xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+	pinsrd	xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+	pshufb	xmm1, [rel byteswap]
+	movdqa	[lane_data + _outer_block], xmm0
+	movdqa	[lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+	mov		dword [lane_data + _outer_block + 7*4], 0x80	; SHA224 keeps 7 words; byte 28 becomes 0x80, bytes 29-31 zero
+%endif
+
+	; Replace the lane's digest column with the 8 dwords at job->auth_key_xor_opad for the outer hash.
+	mov	tmp, [job + _auth_key_xor_opad]
+	movdqu	xmm0, [tmp]
+	movdqu	xmm1,  [tmp + 4*4]
+	movd	[state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+	pextrd	[state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+	pextrd	[state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+	pextrd	[state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+	movd	[state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+	pextrd	[state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+	pextrd	[state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+	pextrd	[state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+	jmp	start_loop	; go hash the outer block
+
+	align	16
+proc_extra_blocks:
+	mov	DWORD(start_offset), [lane_data + _start_offset]	; where the message tail starts inside extra_block
+	; Presumably XPINSRW sets the 16-bit lens entry of lane idx to extra_blocks - confirm in const.inc.
+        movdqa  xmm0, [state + _lens_sha256]
+        XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+        movdqa  [state + _lens_sha256], xmm0
+
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_sha256 + 8*idx], tmp	; lane now reads from the padded tail
+	mov	dword [lane_data + _extra_blocks], 0	; so len_is_0 moves on to the outer hash next time
+	jmp	start_loop
+
+	align	16
+
+copy_lt64:
+	;; less than one message block of data
+	;; beginning of source block
+	;; destination extrablock but backwards by len from where 0x80 pre-populated
+	;; tmp4 (rbx, shared with unused_lanes) is passed to the copy macro, presumably as scratch; unused_lanes is reloaded after it
+	lea	p2, [lane_data + _extra_block  + 64]
+	sub	p2, len	; p2 = extra_block + 64 - len, so the copied bytes end at offset 64
+	memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+	mov	unused_lanes, [state + _unused_lanes_sha256]	; restore the value stored at entry (already shifted)
+	jmp	end_fast_copy
+
+return_null:
+	xor	job_rax, job_rax	; rax = NULL: no job is returned
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]	; rax = finished job (return value)
+	mov	unused_lanes, [state + _unused_lanes_sha256]
+	mov	qword [lane_data + _job_in_lane], 0	; the lane no longer owns a job
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	shl	unused_lanes, 8	; push idx back onto the byte-packed free-lane list
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes_sha256], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]	; p = destination buffer for the tag
+	; Only the exact truncated length (14 for SHA224, 16 for SHA256) takes the short copy; any other value copies the full digest.
+%ifdef SHA224
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 14
+        jne     copy_full_digest
+%else
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 16
+        jne     copy_full_digest
+%endif
+
+	;; copy 14 bytes for SHA224 / 16 bytes for SHA256
+	mov	DWORD(tmp),  [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp)	; byte-swap each word before it is written out
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp3)
+	bswap	DWORD(tmp4)
+	mov	[p + 0*4], DWORD(tmp)
+	mov	[p + 1*4], DWORD(tmp2)
+	mov	[p + 2*4], DWORD(tmp3)
+%ifdef SHA224
+	mov	[p + 3*4], WORD(tmp4)
+%else
+	mov	[p + 3*4], DWORD(tmp4)
+%endif
+        jmp     clear_ret
+
+copy_full_digest:
+	;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+	mov	DWORD(tmp),  [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp)
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp3)
+	bswap	DWORD(tmp4)
+	mov	[p + 0*4], DWORD(tmp)
+	mov	[p + 1*4], DWORD(tmp2)
+	mov	[p + 2*4], DWORD(tmp3)
+	mov	[p + 3*4], DWORD(tmp4)
+	; second half: words 4-6, plus word 7 for SHA256 only
+	mov	DWORD(tmp),  [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+	bswap	DWORD(tmp)
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp3)
+%ifndef SHA224
+	bswap	DWORD(tmp4)
+%endif
+	mov	[p + 4*4], DWORD(tmp)
+	mov	[p + 5*4], DWORD(tmp2)
+	mov	[p + 6*4], DWORD(tmp3)
+%ifndef SHA224
+	mov	[p + 7*4], DWORD(tmp4)
+%endif
+
+clear_ret:
+	; With SAFE_DATA defined, wipe the finished lane's digest words and buffers before returning; rax (the job) is not touched.
+%ifdef SAFE_DATA
+        ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job
+%assign J 0
+%rep 7
+        mov     dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+        mov     dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+        pxor    xmm0, xmm0	; xmm0 = 0, source for the 16-byte clears below
+        imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata_sha256 + lane_data]	; lane_data = record of lane idx
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+        movdqa  [lane_data + _outer_block], xmm0
+%ifdef SHA224
+        mov     qword [lane_data + _outer_block + 16], 0
+        mov     dword [lane_data + _outer_block + 24], 0	; stops at byte 28, leaving the last dword of the 32 untouched
+%else
+        movdqa  [lane_data + _outer_block + 16], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+	mov	rbx, [rsp + _gpr_save + 8*0]	; restore the registers saved at entry
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*3]
+	mov	rdi, [rsp + _gpr_save + 8*4]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm
new file mode 100644
index 000000000..bc7305001
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_384_sse
+%define SHA_X_DIGEST_SIZE 384
+; The SHA-512 flush source included below only sets FUNC and SHA_X_DIGEST_SIZE itself when FUNC is undefined, so these take effect.
+%include "sse/mb_mgr_hmac_sha_512_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm
new file mode 100644
index 000000000..04d7d3aaf
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_384_sse
+%define SHA_X_DIGEST_SIZE 384
+; Reuses the SHA-512 submit source below; FUNC names the routine and SHA_X_DIGEST_SIZE presumably selects the 384-bit variant.
+%include "sse/mb_mgr_hmac_sha_512_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm
new file mode 100644
index 000000000..40f61fa4d
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm
@@ -0,0 +1,331 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha512_x2_sse	; hashing core called from copy_lane_data (arg1 = state/args, arg2 = min lane length)
+
+section .data
+default rel
+align 16
+byteswap:	; pshufb mask reversing the bytes inside each 64-bit qword (ddq 0x08090a0b0c0d0e0f0001020304050607)
+	dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+len_masks:
+	; entry I has 0xFFFF in word I; OR-ed into lens for a lane with no job (ddq 0x0000000000000000000000000000FFFF)
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000
+	dq 0x00000000FFFF0000, 0x0000000000000000
+one:	dq  1	; memory operand for the cmovne that selects lane 1
+
+section .text
+; FUNC and SHA_X_DIGEST_SIZE may already be defined when this point is reached; 512 is the default.
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_512_sse
+%define SHA_X_DIGEST_SIZE 512
+%endif
+; Register map: ABI-dependent argument registers first, then the symbolic names used by the routine.
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+; NOTE: names sharing a register (unused_lanes/lane_data/tmp2, job/len2/extra_blocks/p, the rax group) must not be live together.
+; idx needs to be in rbx, rbp, r12-r15
+%define idx		rbp
+
+%define unused_lanes	rbx
+%define lane_data	rbx
+%define tmp2		rbx
+
+%define job_rax		rax
+%define	tmp1		rax
+%define size_offset	rax
+%define tmp		rax
+%define start_offset	rax
+
+%define tmp3		arg1
+
+%define extra_blocks	arg2
+%define p		arg2
+
+%define tmp4		r8
+
+%define tmp5		r9
+
+%define tmp6		r10
+
+%endif
+; Stack frame: two GPR save slots (rbx, rbp) plus the caller's rsp.
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save:	resq	2
+_rsp_save:	resq	1
+endstruc
+; APPEND pastes two tokens together; used to build one label per lane inside %rep blocks.
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : rdi (LINUX) / rcx (otherwise) : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+	; Returns in rax a job flagged STS_COMPLETED_HMAC, or NULL via return_null.
+	mov	rax, rsp	; remember the caller's rsp; the frame below is realigned
+	sub	rsp, STACK_size
+	and	rsp, -16	; 16-byte align the local frame
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _unused_lanes_sha512]
+	bt	unused_lanes, 16+7	; bit 23 set: presumably every lane is unused (list sentinel in byte 2) - TODO confirm
+	jc	return_null	; then there is nothing to flush
+
+	; find a lane with a non-null job
+	xor	idx, idx	; start with lane 0
+	cmp	qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+	cmovne	idx, [rel one]	; use lane 1 when it holds a job
+copy_lane_data:
+	; copy good lane (idx) to empty lanes
+	movdqa	xmm0, [state + _lens_sha512]
+	mov	tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx]	; data pointer of the lane that has a job
+	; For each lane without a job: reuse that data pointer and force its length word to 0xFFFF so it cannot be the minimum.
+%assign I 0
+%rep 2
+	cmp	qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp
+	por	xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	movdqa	[state + _lens_sha512], xmm0
+
+	phminposuw	xmm1, xmm0	; xmm1 word 0 = smallest 16-bit length, word 1 = its index, remaining bits zero
+	pextrw	len2, xmm1, 0	; min value
+	pextrw	idx, xmm1, 1	; min index (expected to be lane 0 or 1; only two lanes are handled here)
+	cmp	len2, 0
+	je	len_is_0	; a lane already has nothing left to hash: skip the core call
+
+	pshuflw	xmm1, xmm1, 0xA0	; low words become (w0, w0, w2, w2) = (min, min, 0, 0)
+	psubw	xmm0, xmm1	; so only length words 0 and 1 are reduced by the minimum
+	movdqa	[state + _lens_sha512], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha512_x2_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _SHA512_LANE_DATA_size
+	lea	lane_data, [state + _ldata_sha512 + lane_data]	; lane_data = record of lane idx
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks	; padded tail blocks still have to be hashed
+	cmp	dword [lane_data + _outer_done_sha512], 0
+	jne	end_loop	; outer hash already done: the job is complete
+
+proc_outer:
+	mov	dword [lane_data + _outer_done_sha512], 1	; set before scheduling the outer block so the next completion ends the job
+	mov	DWORD(size_offset), [lane_data + _size_offset_sha512]
+	mov	qword [lane_data + _extra_block_sha512 + size_offset], 0	; zero 8 bytes at the stored size offset
+	mov	word [state + _lens_sha512 + 2*idx], 1	; one block (the outer block) left for this lane
+	lea	tmp, [lane_data + _outer_block_sha512]
+	mov	job, [lane_data + _job_in_lane_sha512]	; job shares arg2 with len2/extra_blocks, so it is loaded here
+	mov	[state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp	; next input for this lane is outer_block
+	; Each pass gathers two digest qwords of lane idx, byte-swaps them and stores 16 bytes in outer_block (4 passes for 512, 3 for 384).
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8*16))
+	movq	xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE]
+	pinsrq	xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1) *SHA512_DIGEST_ROW_SIZE], 1
+	pshufb	xmm0, [rel byteswap]
+	movdqa	[lane_data + _outer_block_sha512 + I*16], xmm0
+%assign I (I+1)
+%endrep
+	; Replace the lane's digest column with the 8 qwords at job->auth_key_xor_opad for the outer hash.
+	mov	tmp, [job + _auth_key_xor_opad]
+%assign I 0
+%rep 4
+	movdqu	xmm0, [tmp + I * 16]
+	movq	[state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+	pextrq	[state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+	jmp	copy_lane_data	; go hash the outer block
+
+	align	16
+proc_extra_blocks:
+	mov	DWORD(start_offset), [lane_data + _start_offset_sha512]	; where the message tail starts inside extra_block
+	mov	[state + _lens_sha512 + 2*idx], WORD(extra_blocks)	; schedule the padded tail blocks for this lane
+	lea	tmp, [lane_data + _extra_block_sha512 + start_offset]
+	mov	[state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp	; lane now reads from the padded tail
+	mov	dword [lane_data + _extra_blocks_sha512], 0	; so len_is_0 moves on to the outer hash next time
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax	; rax = NULL: no job is returned
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane_sha512]	; rax = finished job (return value)
+	mov	qword [lane_data + _job_in_lane_sha512], 0	; the lane no longer owns a job
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov	unused_lanes, [state + _unused_lanes_sha512]	; overwrites lane_data (same register)
+	shl	unused_lanes, 8	; push idx back onto the byte-packed free-lane list
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes_sha512], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]	; p = destination buffer for the tag
+	; Only the exact truncated length (32 for SHA512, 24 for SHA384) takes the short copy; any other value copies the full digest.
+%if (SHA_X_DIGEST_SIZE != 384)
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 32
+        jne     copy_full_digest
+%else
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 24
+        jne     copy_full_digest
+%endif
+	;; copy 32 bytes for SHA512 // 24 bytes for SHA384
+	mov	QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+	bswap	QWORD(tmp2)	; byte-swap each word before it is written out
+	bswap	QWORD(tmp4)
+	bswap	QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap	QWORD(tmp5)
+%endif
+	mov	[p + 0*8], QWORD(tmp2)
+	mov	[p + 1*8], QWORD(tmp4)
+	mov	[p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	[p + 3*8], QWORD(tmp5)
+%endif
+        jmp     clear_ret
+copy_full_digest:
+	;; copy 64 bytes for SHA512 // 48 bytes for SHA384
+	mov	QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+	bswap	QWORD(tmp2)
+	bswap	QWORD(tmp4)
+	bswap	QWORD(tmp6)
+	bswap	QWORD(tmp5)
+	mov	[p + 0*8], QWORD(tmp2)
+	mov	[p + 1*8], QWORD(tmp4)
+	mov	[p + 2*8], QWORD(tmp6)
+	mov	[p + 3*8], QWORD(tmp5)
+	; second half: words 4-5, plus words 6-7 for SHA512 only
+	mov	QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+	bswap	QWORD(tmp2)
+	bswap	QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap	QWORD(tmp6)
+	bswap	QWORD(tmp5)
+%endif
+	mov	[p + 4*8], QWORD(tmp2)
+	mov	[p + 5*8], QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	[p + 6*8], QWORD(tmp6)
+	mov	[p + 7*8], QWORD(tmp5)
+%endif
+
+clear_ret:
+	; With SAFE_DATA defined, wipe digest words and buffers of every lane that holds no job; rax (the job) is not touched.
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0	; xmm0 = 0, source for the 16-byte clears below
+
+        ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of each lane without a job
+%assign I 0
+%rep 2
+	cmp	qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0
+	jne	APPEND(skip_clear_,I)
+
+        ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512)
+%assign J 0
+%rep 6
+        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0
+        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+        ;; Use the *_sha512 field offsets, as the rest of this routine does, rather than the unsuffixed names.
+        lea     lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)]
+        ;; Clear first 128 bytes of extra_block
+%assign offset 0
+%rep 8
+        movdqa  [lane_data + _extra_block_sha512 + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+        movdqa  [lane_data + _outer_block_sha512], xmm0
+        movdqa  [lane_data + _outer_block_sha512 + 16], xmm0
+        movdqa  [lane_data + _outer_block_sha512 + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+        movdqa  [lane_data + _outer_block_sha512 + 48], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+	mov	rbx, [rsp + _gpr_save + 8*0]	; restore the registers saved at entry
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	rsp, [rsp + _rsp_save]	; original SP
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm
new file mode 100644
index 000000000..0d6da7bce
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm
@@ -0,0 +1,412 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha512_x2_sse
+
+section .data
+default rel
+align 16
+byteswap:	; pshufb mask (ddq 0x08090a0b0c0d0e0f0001020304050607): reverses the byte order inside each 64-bit half
+	dq 0x0001020304050607, 0x08090a0b0c0d0e0f	; low qword, high qword
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_512_sse
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define reg3	rcx
+%define reg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define reg3	rdi
+%define reg4	rsi
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define last_len	rbp
+%define idx		rbp
+
+%define p		r11
+%define start_offset	r11
+
+%define unused_lanes	rbx
+%define tmp4		rbx
+
+%define job_rax		rax
+%define len		rax
+
+%define size_offset	reg3
+%define tmp2		reg3
+
+%define lane		reg4
+%define tmp3		reg4
+
+%define extra_blocks	r8
+
+%define tmp		r9
+%define p2		r9
+
+%define lane_data	r10
+
+%endif
+
+; Stack frame: FUNC uses rbx and rbp (plus rsi/rdi on non-LINUX builds) as scratch, so it spills them here and reloads them before returning
+struc STACK
+_gpr_save:	resq	4	; slots 0-3: rbx, rbp, rsi, rdi (the last two are only written when LINUX is not defined)
+_rsp_save:	resq	1	; rsp as it was on entry, captured before the frame is aligned
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rdi (LINUX) / rcx (otherwise) : state
+; arg 2 : rsi (LINUX) / rdx (otherwise) : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:	; entry: build the frame, take a free lane and record the job's lengths and data pointer in it
+
+	mov	rax, rsp	; remember the entry rsp; it is stored in the frame below
+	sub	rsp, STACK_size
+	and	rsp, -16	; round the frame down to a 16-byte boundary
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*2], rsi
+	mov	[rsp + _gpr_save + 8*3], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _unused_lanes_sha512]
+	movzx	lane, BYTE(unused_lanes)	; lane = low byte of the free-lane list
+	shr	unused_lanes, 8	; drop that entry from the list
+	imul	lane_data, lane, _SHA512_LANE_DATA_size
+	lea	lane_data, [state + _ldata_sha512+ lane_data]	; lane_data = &state->ldata[lane]
+	mov	[state + _unused_lanes_sha512], unused_lanes
+	mov	len, [job + _msg_len_to_hash_in_bytes]	; len aliases rax, which is free again now that the entry rsp is stored
+	mov	tmp, len
+	shr	tmp, 7	; divide by 128, len in terms of sha512 blocks
+
+	mov	[lane_data + _job_in_lane_sha512], job
+	mov	dword [lane_data + _outer_done_sha512], 0
+
+        movdqa  xmm0, [state + _lens_sha512]	; XPINSRW is defined outside this file; presumably it writes word `tmp` into slot `lane` of xmm0
+        XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+        movdqa  [state + _lens_sha512], xmm0
+
+	mov	last_len, len
+	and	last_len, 127	; bytes left over after the whole 128-byte blocks
+	lea	extra_blocks, [last_len + 17 + 127]	; with the shift below: ceil((last_len + 17) / 128); presumably 17 = 0x80 pad byte + 16-byte length field
+	shr	extra_blocks, 7
+	mov	[lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
+
+	mov	p, [job + _src]
+	add	p, [job + _hash_start_src_offset_in_bytes]
+	mov	[state + _args_data_ptr_sha512 + PTR_SZ*lane], p	; lane hashes straight from the caller's buffer first
+
+	cmp	len, 128
+	jb	copy_lt128	; shorter than one block: use the variable-length copy instead
+
+fast_copy:	; len >= 128: copy the last 128 message bytes into the start of the lane's extra block
+	add	p, len	; p = one past the end of the message
+%assign I 0
+%rep 2
+	movdqu	xmm0, [p - 128 + I*4*16 + 0*16]
+	movdqu	xmm1, [p - 128 + I*4*16 + 1*16]
+	movdqu	xmm2, [p - 128 + I*4*16 + 2*16]
+	movdqu	xmm3, [p - 128 + I*4*16 + 3*16]
+	movdqa	[lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0
+	movdqa	[lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1
+	movdqa	[lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2
+	movdqa	[lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3
+%assign I (I+1)
+%endrep
+end_fast_copy:	; copy_lt128 rejoins here; from this point both paths are identical
+
+	mov	size_offset, extra_blocks
+	shl	size_offset, 7
+	sub	size_offset, last_len
+	add	size_offset, 128-8	; size_offset = start_offset + 128*extra_blocks - 8, i.e. the last 8 bytes of the extra data
+	mov	[lane_data + _size_offset_sha512], DWORD(size_offset)
+	mov	start_offset, 128
+	sub	start_offset, last_len	; the trailing last_len message bytes end at byte 128 of the extra block
+	mov	[lane_data + _start_offset_sha512], DWORD(start_offset)
+
+	lea	tmp, [8*128 + 8*len]	; (128 + len) * 8 bits; presumably the extra 128 bytes are the key^ipad block
+	bswap	tmp	; store the bit count big-endian
+	mov	[lane_data + _extra_block_sha512 + size_offset], tmp
+
+	mov	tmp, [job + _auth_key_xor_ipad]
+ %assign I 0
+ %rep 4
+	movdqu	xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE]	; two consecutive digest words from the job's ipad state
+	movq	[state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0
+	pextrq	[state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+ %assign I (I+1)
+ %endrep
+	test	len, ~127	; any bit above bit 6 set means len >= 128
+	jnz	ge128_bytes
+
+lt128_bytes:	; no whole block in the caller's buffer: hash the extra block right away
+        movdqa  xmm0, [state + _lens_sha512]	; the macro below is passed extra_blocks as the value for this lane's length slot
+        XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+        movdqa  [state + _lens_sha512], xmm0
+
+	lea	tmp, [lane_data + _extra_block_sha512 + start_offset]
+	mov	[state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; lane's data pointer now refers to the buffered message tail
+	mov	dword [lane_data + _extra_blocks_sha512], 0	; extra blocks are now scheduled, nothing left to queue
+
+ge128_bytes:
+	cmp	unused_lanes, 0xff	; 0xff is what remains of the list once no free lane is left
+	jne	return_null	; a lane is still free: return without hashing
+	jmp	start_loop
+
+	align	16
+start_loop:	; pick the lane with the fewest remaining blocks and hash that many blocks
+	; Find min length
+	movdqa	xmm0, [state + _lens_sha512]
+	phminposuw	xmm1, xmm0	; xmm1 word 0 = minimum, word 1 = its index, remaining words zero
+	pextrw	DWORD(len2), xmm1, 0	; min value
+	pextrw	DWORD(idx), xmm1, 1	; min index (0...1)
+	cmp	len2, 0
+	je	len_is_0	; the minimum lane already has nothing left to hash
+
+	pshuflw	xmm1, xmm1, 0XA0	; words 0,1 <- min; words 2,3 <- word 2 (zero after phminposuw)
+	psubw	xmm0, xmm1	; subtract min from length words 0 and 1
+	movdqa	[state + _lens_sha512], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha512_x2_sse
+	; state and idx are intact
+
+len_is_0:	; lane idx has no blocks left: decide which phase comes next
+	; process completed job "idx"
+	imul	lane_data, idx, _SHA512_LANE_DATA_size
+	lea	lane_data, [state + _ldata_sha512 + lane_data]	; lane_data = &state->ldata[idx]
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks	; padded tail still has to be hashed
+	cmp	dword [lane_data + _outer_done_sha512], 0
+	jne	end_loop	; outer hash already done: the job is complete
+
+proc_outer:	; inner hash finished: queue one block holding the inner digest, seeded with the opad state
+	mov	dword [lane_data + _outer_done_sha512], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset_sha512]
+	mov	qword [lane_data + _extra_block_sha512 + size_offset], 0	; erase the bit-length field written at submit time
+
+        movdqa  xmm0, [state + _lens_sha512]	; the macro below is passed the constant 1 as the value for lane idx's length slot
+        XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+        movdqa  [state + _lens_sha512], xmm0
+
+	lea	tmp, [lane_data + _outer_block_sha512]
+	mov	job, [lane_data + _job_in_lane_sha512]
+	mov	[state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp	; lane now hashes from its outer block
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+	movq	xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE]
+	pinsrq	xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+	pshufb	xmm0, [rel byteswap]	; byte-reverse each 64-bit digest word
+	movdqa	[lane_data + _outer_block_sha512 + I*16], xmm0	; inner digest becomes the outer hash's message
+%assign I (I+1)
+%endrep
+
+	mov	tmp, [job + _auth_key_xor_opad]
+%assign I 0
+%rep 4
+	movdqu	xmm0, [tmp + I*16]	; two consecutive digest words from the job's opad state
+	movq	[state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+	pextrq	[state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+	jmp	start_loop
+
+	align	16
+proc_extra_blocks:	; caller's buffer is consumed: point lane idx at its padded extra block(s)
+	mov	DWORD(start_offset), [lane_data + _start_offset_sha512]
+
+        movdqa  xmm0, [state + _lens_sha512]	; the macro below is passed extra_blocks as the value for lane idx's length slot
+        XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+        movdqa  [state + _lens_sha512], xmm0
+
+	lea	tmp, [lane_data + _extra_block_sha512 + start_offset]
+	mov	[state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+	mov	dword [lane_data + _extra_blocks_sha512], 0	; mark the extra blocks as scheduled
+	jmp	start_loop
+
+	align	16
+copy_lt128:
+	;; Message is shorter than one 128-byte block: copy all `len` bytes from p
+	;; so that they end exactly at byte 128 of this lane's extra block
+	;; (the 0x80 padding byte is expected to be pre-populated at that position)
+	lea	p2, [lane_data + _extra_block_sha512 + 128]	; SHA-512 lane field, same symbol fast_copy uses
+	sub	p2, len	; p2 = destination of the first message byte
+	memcpy_sse_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+	mov	unused_lanes, [state + _unused_lanes_sha512]	; tmp4 aliases unused_lanes (rbx) and was given to the copy macro as scratch
+	jmp	end_fast_copy
+
+return_null:	; job was queued but nothing completed: return NULL
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:	; lane idx finished both hashes: free the lane and write the tag to the job's output buffer
+	mov	job_rax, [lane_data + _job_in_lane_sha512]	; completed job is the return value
+	mov	unused_lanes, [state + _unused_lanes_sha512]
+	mov	qword [lane_data + _job_in_lane_sha512], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	shl	unused_lanes, 8
+	or	unused_lanes, idx	; push idx back onto the free-lane list
+	mov	[state + _unused_lanes_sha512], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 32
+        jne     copy_full_digest	; any length other than 32 gets the full 64-byte digest
+%else
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 24
+        jne     copy_full_digest	; any length other than 24 gets the full 48-byte digest
+%endif
+
+	;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+	mov	QWORD(tmp),  [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512
+%endif
+	bswap	QWORD(tmp)	; byte-swap each digest word before storing it
+	bswap	QWORD(tmp2)
+	bswap	QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap	QWORD(tmp4)
+%endif
+	mov	[p + 0*8], QWORD(tmp)
+	mov	[p + 1*8], QWORD(tmp2)
+	mov	[p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	[p + 3*8], QWORD(tmp4)
+%endif
+        jmp     clear_ret
+
+copy_full_digest:	; write the whole digest of lane idx to p, byte-swapping each word
+	;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+	mov	QWORD(tmp),  [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] ; word 3 is copied for both SHA-384 and SHA-512 on this path
+	bswap	QWORD(tmp)
+	bswap	QWORD(tmp2)
+	bswap	QWORD(tmp3)
+	bswap	QWORD(tmp4)
+	mov	[p + 0*8], QWORD(tmp)
+	mov	[p + 1*8], QWORD(tmp2)
+	mov	[p + 2*8], QWORD(tmp3)
+	mov	[p + 3*8], QWORD(tmp4)
+	mov	QWORD(tmp),  [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+	mov	QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512
+%endif
+	bswap	QWORD(tmp)
+	bswap	QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap	QWORD(tmp3)
+	bswap	QWORD(tmp4)
+%endif
+	mov	[p + 4*8], QWORD(tmp)
+	mov	[p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov	[p + 6*8], QWORD(tmp3)	; words 6 and 7 exist only in the SHA-512 build
+	mov	[p + 7*8], QWORD(tmp4)
+%endif
+
+clear_ret:	; with SAFE_DATA, wipe the finished lane's digest and buffers before returning the job
+
+%ifdef SAFE_DATA
+        ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
+%assign J 0
+%rep 6
+        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0	; SHA-512 row stride, same as rows 0-5
+        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+        pxor    xmm0, xmm0
+        imul    lane_data, idx, _SHA512_LANE_DATA_size
+        lea     lane_data, [state + _ldata_sha512 + lane_data]	; lane_data = &state->ldata[idx]
+        ;; Clear first 128 bytes of extra_block
+%assign offset 0
+%rep 8
+        movdqa  [lane_data + _extra_block_sha512 + offset], xmm0	; SHA-512 lane field, not the SHA-1 lane's _extra_block
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+        movdqa  [lane_data + _outer_block_sha512], xmm0	; same field proc_outer fills with the inner digest
+        movdqa  [lane_data + _outer_block_sha512 + 16], xmm0
+        movdqa  [lane_data + _outer_block_sha512 + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+        movdqa  [lane_data + _outer_block_sha512 + 48], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:	; common exit: rax already holds the completed job or NULL
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*2]
+	mov	rdi, [rsp + _gpr_save + 8*3]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm
new file mode 100644
index 000000000..e0b0460f4
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm
@@ -0,0 +1,370 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;;	callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;;	callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers:		RAX RBX RCX RDX RBP RSI RDI R8  R9  R10 R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Windows clobbers:	RAX     RCX RDX             R8  R9  R10 R11
+;; Windows preserves:	    RBX         RBP RSI RDI                 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Linux clobbers:	RAX     RCX RDX     RSI RDI R8  R9  R10 R11
+;; Linux preserves:	    RBX         RBP                         R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_ni
+
+section .data
+default rel
+
+align 16
+byteswap:	; pshufb mask: reverses the byte order inside each 32-bit word
+	dq 0x0405060700010203	; selects source bytes 3,2,1,0 then 7,6,5,4
+	dq 0x0c0d0e0f08090a0b	; selects source bytes 11,10,9,8 then 15,14,13,12
+
+section .text
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define reg3	rcx
+%define reg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define reg3	rdi
+%define reg4	rsi
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+; idx needs to be in rbx, rbp, r12-r15
+%define last_len        rbp
+%define idx             rbp
+%define p4              rbp
+
+%define p               r11
+%define start_offset    r11
+
+%define unused_lanes    rbx
+%define tmp4            rbx
+%define p3		rbx
+
+%define job_rax         rax
+%define len             rax
+
+%define size_offset     reg3
+%define tmp2		reg3
+
+%define lane            reg4
+%define tmp3		reg4
+
+%define extra_blocks    r8
+
+%define tmp             r9
+%define p2              r9
+
+%define lane_data       r10
+
+struc STACK	; frame used by submit_job_hmac_ni_sse to save registers across the call
+_gpr_save:	resq	4	; slots 0-3: rbx, rbp, rsi, rdi (the last two are only written when LINUX is not defined)
+_rsp_save:	resq	1	; rsp as it was on entry, captured before the frame is aligned
+endstruc
+
+; JOB* submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rdi (LINUX) / rcx (otherwise) : state
+; arg 2 : rsi (LINUX) / rdx (otherwise) : job
+MKGLOBAL(submit_job_hmac_ni_sse,function,internal)
+submit_job_hmac_ni_sse:	; entry: build the frame, take a free lane and record the job's lengths and data pointer in it
+
+        mov	rax, rsp	; remember the entry rsp; it is stored in the frame below
+        sub	rsp, STACK_size
+        and	rsp, -16	; round the frame down to a 16-byte boundary
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*2], rsi
+	mov	[rsp + _gpr_save + 8*3], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+        DBGPRINTL "enter sha1-ni-sse submit"
+        mov	unused_lanes, [state + _unused_lanes]
+        movzx	lane, BYTE(unused_lanes)	; lane = low byte of the free-lane list
+	DBGPRINTL64 "lane: ", lane
+        shr	unused_lanes, 8	; drop that entry from the list
+        imul	lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[lane]
+        mov	[state + _unused_lanes], unused_lanes
+        mov	len, [job + _msg_len_to_hash_in_bytes]	; len aliases rax, which is free again now that the entry rsp is stored
+	DBGPRINTL64 "length: ", len
+        mov	tmp, len
+        shr	tmp, 6	; divide by 64, len in terms of blocks
+
+        mov	[lane_data + _job_in_lane], job
+        mov	dword [lane_data + _outer_done], 0
+        mov	[state + _lens + 2*lane], WORD(tmp)	; only the low 16 bits of the block count are kept
+
+        mov	last_len, len
+        and	last_len, 63	; bytes left over after the whole 64-byte blocks
+        lea	extra_blocks, [last_len + 9 + 63]	; with the shift below: ceil((last_len + 9) / 64); presumably 9 = 0x80 pad byte + 8-byte length field
+        shr	extra_blocks, 6
+        mov	[lane_data + _extra_blocks], DWORD(extra_blocks)
+
+        mov	p, [job + _src]
+        add	p, [job + _hash_start_src_offset_in_bytes]
+	DBGPRINTL64 "src pointer + offset:", p
+        mov	[state + _args_data_ptr + PTR_SZ*lane], p	; lane hashes straight from the caller's buffer first
+        cmp	len, 64
+        jb	copy_lt64	; shorter than one block: use the variable-length copy instead
+
+fast_copy:	; len >= 64: copy the last 64 message bytes into the start of the lane's extra block
+        add	p, len	; p = one past the end of the message
+        movdqu	xmm0, [p - 64 + 0*16]
+        movdqu	xmm1, [p - 64 + 1*16]
+        movdqu	xmm2, [p - 64 + 2*16]
+        movdqu	xmm3, [p - 64 + 3*16]
+        movdqa	[lane_data + _extra_block + 0*16], xmm0
+        movdqa	[lane_data + _extra_block + 1*16], xmm1
+        movdqa	[lane_data + _extra_block + 2*16], xmm2
+        movdqa	[lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:	; copy_lt64 rejoins here; from this point both paths are identical
+
+        mov	size_offset, extra_blocks
+        shl	size_offset, 6
+        sub	size_offset, last_len
+        add	size_offset, 64-8	; size_offset = start_offset + 64*extra_blocks - 8, i.e. the last 8 bytes of the extra data
+        mov	[lane_data + _size_offset], DWORD(size_offset)
+        mov	start_offset, 64
+        sub	start_offset, last_len	; the trailing last_len message bytes end at byte 64 of the extra block
+        mov	[lane_data + _start_offset], DWORD(start_offset)
+
+        lea	tmp, [8*64 + 8*len]	; (64 + len) * 8 bits; presumably the extra 64 bytes are the key^ipad block
+        bswap	tmp	; store the bit count big-endian
+        mov	[lane_data + _extra_block + size_offset], tmp
+
+        mov	tmp, [job + _auth_key_xor_ipad]
+        movdqu	xmm0, [tmp]	; first 16 bytes of the ipad state
+        mov	DWORD(tmp),  [tmp + 4*SHA1_DIGEST_WORD_SIZE]	; fifth word; overwrites the pointer held in tmp
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+	lea	p4, [lane + lane*4]	; p4 = 5*lane, so p4*4 = lane * 20 bytes; p4 reuses rbp, last_len is no longer needed
+        movdqu	[state + _args_digest + p4*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0
+        mov	[state + _args_digest + p4*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+        test	len, ~63	; any bit above bit 5 set means len >= 64
+        jnz	ge64_bytes
+
+lt64_bytes:	; no whole block in the caller's buffer: hash the extra block right away
+        mov	[state + _lens + 2*lane], WORD(extra_blocks)
+        lea	tmp, [lane_data + _extra_block + start_offset]
+        mov	[state + _args_data_ptr + PTR_SZ*lane], tmp	; lane's data pointer now refers to the buffered message tail
+        mov	dword [lane_data + _extra_blocks], 0	; extra blocks are now scheduled, nothing left to queue
+
+ge64_bytes:
+        cmp	unused_lanes, 0xff	; 0xff is what remains of the list once no free lane is left
+        jne	return_null	; a lane is still free: return without hashing
+        jmp	start_loop
+
+        align	16
+start_loop:	; pick the lane with the fewest remaining blocks and hash that many blocks
+	; Find min length - only two lanes available
+	xor     len2, len2
+	mov	p3, 0x10000	; p3 aliases rbx (unused_lanes); end_loop reloads unused_lanes from state
+	mov	WORD(len2), word [state + _lens + 0*2]	; [0:15] - lane 0 length, [16:31] - lane index (0)
+	mov	WORD(p3), word [state + _lens + 1*2]	; [0:15] - lane 1 length, [16:31] - lane index (1)
+	cmp     WORD(len2), WORD(p3)
+	cmova	DWORD(len2), DWORD(p3)	; unsigned compare (lengths are 16-bit block counts): take lane 1 if lane 0 length is above it
+
+	mov	idx, len2		; retrieve index & length from [16:31] and [0:15] bit fields
+	shr	DWORD(idx), 16
+	and	DWORD(len2), 0xffff	; also sets ZF when the minimum length is zero
+	je	len_is_0
+
+	sub	word [state + _lens + 0*2], WORD(len2)
+	sub	word [state + _lens + 1*2], WORD(len2)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+        call	sha1_ni
+        ; state is intact
+
+len_is_0:	; lane idx has no blocks left: decide which phase comes next
+        ; process completed job "idx"
+        imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[idx]
+        mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+        cmp	extra_blocks, 0
+        jne	proc_extra_blocks	; padded tail still has to be hashed
+        cmp	dword [lane_data + _outer_done], 0
+        jne	end_loop	; outer hash already done: the job is complete
+
+proc_outer:	; inner hash finished: queue one block holding the inner digest, seeded with the opad state
+        mov	dword [lane_data + _outer_done], 1
+        mov	DWORD(size_offset), [lane_data + _size_offset]
+        mov	qword [lane_data + _extra_block + size_offset], 0	; erase the bit-length field written at submit time
+        mov	word [state + _lens + 2*idx], 1	; exactly one block to hash for the outer pass
+        lea	tmp, [lane_data + _outer_block]
+        mov	job, [lane_data + _job_in_lane]
+        mov	[state + _args_data_ptr + PTR_SZ*idx], tmp	; lane now hashes from its outer block
+
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error	"Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+	lea	p3, [idx + idx*4]	; p3 = 5*idx, so p3*4 = idx * 20 bytes
+        movdqu	xmm0, [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE]
+        pshufb	xmm0, [rel byteswap]	; byte-reverse each 32-bit digest word
+        mov	DWORD(tmp),  [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE]
+        bswap	DWORD(tmp)
+        movdqa	[lane_data + _outer_block], xmm0	; inner digest becomes the outer hash's message
+        mov	[lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+        mov	tmp, [job + _auth_key_xor_opad]
+        movdqu	xmm0, [tmp]	; first 16 bytes of the opad state
+        mov	DWORD(tmp),  [tmp + 4*SHA1_DIGEST_WORD_SIZE]	; fifth word; overwrites the pointer held in tmp
+        movdqu	[state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0
+        mov	[state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+        jmp	start_loop
+
+        align	16
+proc_extra_blocks:	; caller's buffer is consumed: point lane idx at its padded extra block(s)
+        mov	DWORD(start_offset), [lane_data + _start_offset]
+        mov	[state + _lens + 2*idx], WORD(extra_blocks)
+        lea	tmp, [lane_data + _extra_block + start_offset]
+        mov	[state + _args_data_ptr + PTR_SZ*idx], tmp
+        mov	dword [lane_data + _extra_blocks], 0	; mark the extra blocks as scheduled
+        jmp	start_loop
+
+        align	16
+copy_lt64:	; message shorter than 64 bytes: copy all of it so it ends at byte 64 of the extra block
+        ;; less than one message block of data
+        ;; beginning of source block
+        ;; destination extrablock but backwards by len from where 0x80 pre-populated
+        lea	p2, [lane_data + _extra_block  + 64]
+        sub     p2, len	; p2 = destination of the first message byte
+        memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+        mov	unused_lanes, [state + _unused_lanes]	; tmp4 aliases unused_lanes (rbx) and was given to the copy macro as scratch
+        jmp	end_fast_copy
+
+return_null:	; job was queued but nothing completed: return NULL
+        xor	job_rax, job_rax
+        jmp	return
+
+        align	16
+end_loop:	; lane idx finished both hashes: free the lane and write the tag to the job's output buffer
+        mov	job_rax, [lane_data + _job_in_lane]	; completed job is the return value
+        mov	unused_lanes, [state + _unused_lanes]
+        mov	qword [lane_data + _job_in_lane], 0
+        or	dword [job_rax + _status], STS_COMPLETED_HMAC
+        shl	unused_lanes, 8
+        or	unused_lanes, idx	; push idx back onto the free-lane list
+        mov	[state + _unused_lanes], unused_lanes
+
+        mov	p, [job_rax + _auth_tag_output]
+
+        ; copy 12 bytes
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error	"Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+	lea	idx, [idx + 4*idx]	; idx = 5*lane from here on, so idx*4 = lane * 20 bytes
+        mov	DWORD(tmp),  [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE]
+        mov	DWORD(tmp2), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE]
+        mov	DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE]
+        bswap	DWORD(tmp)	; byte-swap each digest word before storing it
+        bswap	DWORD(tmp2)
+        bswap	DWORD(tmp3)
+        mov	[p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+        mov	[p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+        mov	[p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 12
+        je      clear_ret	; any length other than 12 also gets words 3 and 4 below
+
+        ;; copy remaining 8 bytes to return 20 byte digest
+        mov	DWORD(tmp),  [state + _args_digest + idx*4 + 3*SHA1_DIGEST_WORD_SIZE]
+        mov	DWORD(tmp2), [state + _args_digest + idx*4 + 4*SHA1_DIGEST_WORD_SIZE]
+        bswap	DWORD(tmp)
+        bswap	DWORD(tmp2)
+        mov	[p + 3*4], DWORD(tmp)
+        mov	[p + 4*4], DWORD(tmp2)
+
+clear_ret:	; with SAFE_DATA, wipe the finished lane's digest and buffers before returning the job
+
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+        ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+        ;; idx = 0 or 5 (depending on lane)
+        movdqu  [state + _args_digest + idx*4], xmm0
+        mov     dword [state + _args_digest + idx*4 + 16], 0
+
+        shr     idx, 2 ;; idx == 5 ? 1 : 0
+        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea     lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[lane]
+        ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear 20 bytes of outer_block
+        movdqa  [lane_data + _outer_block], xmm0	; first 16 bytes
+        mov     dword [lane_data + _outer_block + 16], 0	; remaining 4 bytes
+%endif
+
+return:	; common exit: rax already holds the completed job or NULL
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*2]
+	mov	rdi, [rsp + _gpr_save + 8*3]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+        ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm
new file mode 100644
index 000000000..bc59e7943
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm
@@ -0,0 +1,364 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_mult_sse
+
+section .data
+default rel
+
+;; PSHUFB control mask used by proc_outer: byte indices 3,2,1,0, 7,6,5,4,
+;; 11,10,9,8, 15,14,13,12 - i.e. it reverses the byte order inside each of
+;; the four 32-bit words of an XMM register.
+;; The 128-bit constant is written as two qwords (low qword first); the
+;; original single-value "ddq" spelling is kept in the trailing comment.
+align 16
+byteswap:	;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+;; Symbolic register names for submit_job_hmac_sse.
+;; arg1/arg2 are the first two argument registers of the build's calling
+;; convention; reg3/reg4 are two further general-purpose registers whose
+;; choice differs between the LINUX and non-LINUX builds.
+;;
+;; NOTE: several names below alias the SAME physical register and therefore
+;; can never be live at the same time:
+;;   job / len2            -> arg2
+;;   last_len / idx        -> rbp
+;;   p / start_offset      -> r11
+;;   unused_lanes / tmp4   -> rbx
+;;   job_rax / len         -> rax
+;;   size_offset / tmp2    -> reg3
+;;   lane / tmp3           -> reg4
+;;   tmp / p2              -> r9
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define reg3	rcx
+%define reg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define reg3	rdi
+%define reg4	rsi
+%endif
+
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+; (presumably so that it survives the call to sha1_mult_sse - confirm against
+;  that routine's clobber list)
+%define last_len        rbp
+%define idx             rbp
+
+%define p               r11
+%define start_offset    r11
+
+%define unused_lanes    rbx
+%define tmp4            rbx
+
+%define job_rax         rax
+%define len             rax
+
+%define size_offset     reg3
+%define tmp2		reg3
+
+%define lane            reg4
+%define tmp3		reg4
+
+%define extra_blocks    r8
+
+%define tmp             r9
+%define p2              r9
+
+%define lane_data       r10
+
+%endif
+
+; Stack frame of submit_job_hmac_sse.
+; The routine uses rbx and rbp (plus rsi and rdi on non-LINUX builds); it
+; saves them in _gpr_save on entry and restores them at "return".
+;   _gpr_save : 4 qwords - slot 0 = rbx, 1 = rbp, 2 = rsi, 3 = rdi
+;               (slots 2 and 3 are only written/read when LINUX is not defined)
+;   _rsp_save : 1 qword  - caller's rsp, taken before the frame is aligned
+struc STACK
+_gpr_save:	resq	4
+_rsp_save:	resq	1
+endstruc
+
+; JOB* submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+;
+; Places "job" into the next free lane of the HMAC-SHA1 out-of-order manager.
+; arg 1 : state : rdi when LINUX is defined, rcx otherwise
+; arg 2 : job   : rsi when LINUX is defined, rdx otherwise
+; Returns (rax) : NULL if free lanes remain after this submission; otherwise
+;                 the lanes are processed until one job has finished both
+;                 hash passes and that job is returned.
+; rbx, rbp (and rsi, rdi on non-LINUX builds) are saved here and restored at
+; "return"; XMM registers are used without being saved.
+MKGLOBAL(submit_job_hmac_sse,function, internal)
+submit_job_hmac_sse:
+
+        ; Carve out a 16-byte aligned STACK frame; rax carries the caller's
+        ; rsp until it is stored in _rsp_save below.
+        mov	rax, rsp
+        sub	rsp, STACK_size
+        and	rsp, -16
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*2], rsi
+	mov	[rsp + _gpr_save + 8*3], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+        DBGPRINTL "enter sha1-sse submit"
+        ; Pop the next free lane number from the low byte of unused_lanes and
+        ; compute that lane's per-lane data address.
+        mov	unused_lanes, [state + _unused_lanes]
+        movzx	lane, BYTE(unused_lanes)
+        shr	unused_lanes, 8
+        imul	lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata + lane_data]
+        mov	[state + _unused_lanes], unused_lanes
+        mov	len, [job + _msg_len_to_hash_in_bytes]
+        mov	tmp, len
+        shr	tmp, 6	; divide by 64, len in terms of blocks
+
+        mov	[lane_data + _job_in_lane], job
+        mov	dword [lane_data + _outer_done], 0
+
+        ; Record the whole-block count for this lane in state->lens.
+        ; XPINSRW is a macro defined elsewhere; it presumably inserts "tmp"
+        ; into 16-bit element "lane" of xmm0, with xmm1 and p as scratch
+        ; (p has not been loaded yet at this point) - confirm in const.inc.
+        movdqa  xmm0, [state + _lens]
+        XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+        movdqa  [state + _lens], xmm0
+
+        ; last_len     = len mod 64 (bytes in the final partial block)
+        ; extra_blocks = ceil((last_len + 9) / 64): 64-byte blocks needed for
+        ;                the tail plus, presumably, the 0x80 pad byte and the
+        ;                8-byte length field written at end_fast_copy.
+        mov	last_len, len
+        and	last_len, 63
+        lea	extra_blocks, [last_len + 9 + 63]
+        shr	extra_blocks, 6
+        mov	[lane_data + _extra_blocks], DWORD(extra_blocks)
+
+        ; Lane data pointer = src + hash start offset.
+        mov	p, [job + _src]
+        add	p, [job + _hash_start_src_offset_in_bytes]
+        mov	[state + _args_data_ptr + PTR_SZ*lane], p
+        ; Messages shorter than 64 bytes take the byte-granular copy path.
+        cmp	len, 64
+        jb	copy_lt64
+
+fast_copy:
+        ; len >= 64: copy the LAST 64 bytes of the message into
+        ; extra_block[0..63]. Source loads are unaligned; the stores use
+        ; movdqa, so extra_block is expected to be 16-byte aligned.
+        add	p, len
+        movdqu	xmm0, [p - 64 + 0*16]
+        movdqu	xmm1, [p - 64 + 1*16]
+        movdqu	xmm2, [p - 64 + 2*16]
+        movdqu	xmm3, [p - 64 + 3*16]
+        movdqa	[lane_data + _extra_block + 0*16], xmm0
+        movdqa	[lane_data + _extra_block + 1*16], xmm1
+        movdqa	[lane_data + _extra_block + 2*16], xmm2
+        movdqa	[lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+        ; Reached from both copy paths: the message tail now ends at
+        ; extra_block offset 64.
+
+        ; size_offset  = 64*extra_blocks - last_len + 56
+        ;              = start_offset + 64*extra_blocks - 8,
+        ;                i.e. the last 8 bytes of the final extra block.
+        ; start_offset = 64 - last_len, first tail byte inside extra_block.
+        mov	size_offset, extra_blocks
+        shl	size_offset, 6
+        sub	size_offset, last_len
+        add	size_offset, 64-8
+        mov	[lane_data + _size_offset], DWORD(size_offset)
+        mov	start_offset, 64
+        sub	start_offset, last_len
+        mov	[lane_data + _start_offset], DWORD(start_offset)
+
+        ; Length field = 8 * (64 + len) bits, byte-reversed with bswap before
+        ; it is stored. The extra 64 bytes presumably account for the key^ipad
+        ; block already folded into the digest loaded below.
+        lea	tmp, [8*64 + 8*len]
+        bswap	tmp
+        mov	[lane_data + _extra_block + size_offset], tmp
+
+        ; Seed this lane's digest with the 5 words at job->auth_key_xor_ipad.
+        ; The digest array is stored transposed: word i of a lane lives at
+        ; _args_digest + SHA1_DIGEST_WORD_SIZE*lane + i*SHA1_DIGEST_ROW_SIZE.
+        mov	tmp, [job + _auth_key_xor_ipad]
+        movdqu	xmm0, [tmp]
+        mov	DWORD(tmp),  [tmp + 4*4]
+        movd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+        pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+        pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+        pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+        mov	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+        ; Any bit above the low six set => len >= 64, whole source blocks
+        ; are hashed first and the lane setup done at entry stands.
+        test	len, ~63
+        jnz	ge64_bytes
+
+lt64_bytes:
+        ; len < 64: there are no whole source blocks, so schedule the extra
+        ; block(s) right away - lens[lane] = extra_blocks, data pointer =
+        ; extra_block + start_offset - and zero the stored extra_blocks count
+        ; so they are not scheduled a second time.
+        movdqa  xmm0, [state + _lens]
+        XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+        movdqa  [state + _lens], xmm0
+
+        lea	tmp, [lane_data + _extra_block + start_offset]
+        mov	[state + _args_data_ptr + PTR_SZ*lane], tmp
+        mov	dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+        ; unused_lanes holds the free-lane list after this job's lane was
+        ; popped. Anything other than the bare 0xff byte means free lanes
+        ; remain: return NULL without hashing. Otherwise start processing.
+        cmp	unused_lanes, 0xff
+        jne	return_null
+        movdqa  xmm0, [state + _lens]
+        jmp	start_loop
+
+        align	16
+start_loop:
+        ; On entry xmm0 = current contents of state->lens.
+        ; Find min length
+        phminposuw	xmm1, xmm0
+        pextrw	len2, xmm1, 0	; min value
+        pextrw	idx, xmm1, 1	; min index (0...3)
+        ; A minimum of 0 means lane idx already has nothing left to hash.
+        cmp	len2, 0
+        je	len_is_0
+
+        ; Broadcast the minimum into the four low words only (the upper four
+        ; words of the phminposuw result are zero) and subtract it from lens.
+        pshuflw	xmm1, xmm1, 0
+        psubw	xmm0, xmm1
+        movdqa	[state + _lens], xmm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call	sha1_mult_sse
+        ; state is intact
+
+len_is_0:
+        ; process completed job "idx"
+        ; Three cases for the lane that just reached length 0:
+        ;   extra_blocks != 0 -> padded tail still to hash (proc_extra_blocks)
+        ;   outer_done  != 0  -> second (outer) hash finished (end_loop)
+        ;   otherwise         -> first hash finished, set up the outer hash
+        imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea	lane_data, [state + _ldata + lane_data]
+        mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+        cmp	extra_blocks, 0
+        jne	proc_extra_blocks
+        cmp	dword [lane_data + _outer_done], 0
+        jne	end_loop
+
+proc_outer:
+        mov	dword [lane_data + _outer_done], 1
+        ; Zero the 8-byte length field that submit wrote into extra_block.
+        mov	DWORD(size_offset), [lane_data + _size_offset]
+        mov	qword [lane_data + _extra_block + size_offset], 0
+
+        ; The outer hash is exactly one block: lens[idx] = 1.
+        ; xmm1 keeps the updated lens; it is copied to xmm0 before looping.
+        movdqa  xmm1, [state + _lens]
+        XPINSRW xmm1, xmm2, tmp, idx, 1, scale_x16
+        movdqa  [state + _lens], xmm1
+
+        ; job's register (arg2) was overwritten as len2 in start_loop, so it
+        ; is reloaded from the lane here.
+        lea	tmp, [lane_data + _outer_block]
+        mov	job, [lane_data + _job_in_lane]
+        mov	[state + _args_data_ptr + PTR_SZ*idx], tmp
+
+        ; Gather the lane's 5 digest words from the transposed digest array,
+        ; reverse the bytes of each word and store them as the first 20
+        ; bytes of outer_block.
+        movd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+        pinsrd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+        pinsrd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+        pinsrd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+        pshufb	xmm0, [rel byteswap]
+        mov	DWORD(tmp),  [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+        bswap	DWORD(tmp)
+        movdqa	[lane_data + _outer_block], xmm0
+        mov	[lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+        ; Re-seed the lane's digest with the 5 words at job->auth_key_xor_opad.
+        mov	tmp, [job + _auth_key_xor_opad]
+        movdqu	xmm0, [tmp]
+        mov	DWORD(tmp),  [tmp + 4*4]
+        movd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+        pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+        pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+        pextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+        mov	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+        ; start_loop expects the current lens vector in xmm0.
+        movdqa  xmm0, xmm1
+        jmp	start_loop
+
+        align	16
+proc_extra_blocks:
+        ; Lane idx has consumed its whole source blocks; schedule the padded
+        ; tail held in extra_block. On entry extra_blocks holds the lane's
+        ; non-zero stored count (loaded at len_is_0).
+        mov	DWORD(start_offset), [lane_data + _start_offset]
+
+        ; lens[idx] = extra_blocks; xmm0 is left holding the updated lens
+        ; vector, as start_loop expects.
+        movdqa  xmm0, [state + _lens]
+        XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+        movdqa  [state + _lens], xmm0
+
+        ; Data pointer = extra_block + start_offset; zero the stored count so
+        ; the next pass through len_is_0 moves on to the outer hash.
+        lea	tmp, [lane_data + _extra_block + start_offset]
+        mov	[state + _args_data_ptr + PTR_SZ*idx], tmp
+        mov	dword [lane_data + _extra_blocks], 0
+        jmp	start_loop
+
+        align	16
+copy_lt64:
+        ;; less than one message block of data
+        ;; beginning of source block
+        ;; destination extrablock but backwards by len from where 0x80 pre-populated
+        ;; p2 = extra_block + 64 - len, so the copied bytes end at offset 64.
+        lea	p2, [lane_data + _extra_block  + 64]
+        sub     p2, len
+        ;; memcpy_sse_64_1 is a macro from include/memcpy.asm; tmp4, tmp2 and
+        ;; xmm0-xmm3 are handed to it as scratch. tmp4 aliases unused_lanes
+        ;; (rbx), which is why unused_lanes is reloaded from state afterwards.
+        memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+        mov	unused_lanes, [state + _unused_lanes]
+        jmp	end_fast_copy
+
+return_null:
+        ;; No job completed by this call: return NULL.
+        xor	job_rax, job_rax
+        jmp	return
+
+        align	16
+end_loop:
+        ; Lane idx has finished its outer hash: detach the job from the lane,
+        ; flag it STS_COMPLETED_HMAC and push idx back onto the free-lane
+        ; list (new low byte of unused_lanes).
+        mov	job_rax, [lane_data + _job_in_lane]
+        mov	unused_lanes, [state + _unused_lanes]
+        mov	qword [lane_data + _job_in_lane], 0
+        or	dword [job_rax + _status], STS_COMPLETED_HMAC
+        shl	unused_lanes, 8
+        or	unused_lanes, idx
+        mov	[state + _unused_lanes], unused_lanes
+
+        mov	p, [job_rax + _auth_tag_output]
+
+        ; copy 12 bytes
+        ; (digest words 0..2, each byte-reversed with bswap before the store)
+        mov	DWORD(tmp),  [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+        mov	DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+        mov	DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+        bswap	DWORD(tmp)
+        bswap	DWORD(tmp2)
+        bswap	DWORD(tmp3)
+        mov	[p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+        mov	[p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+        mov	[p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+        ; Only a requested tag length of exactly 12 stops here; every other
+        ; value falls through and writes the full 20-byte digest.
+        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 12
+        je      clear_ret
+
+        ;; copy remaining 8 bytes to return 20 byte digest
+        mov	DWORD(tmp),  [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+        mov	DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+        bswap	DWORD(tmp)
+        bswap	DWORD(tmp2)
+        mov	[p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+        mov	[p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+
+clear_ret:
+        ;; Reached only from end_loop, i.e. with a completed job in job_rax
+        ;; and its lane number in idx. Without SAFE_DATA this label falls
+        ;; straight through to "return".
+
+%ifdef SAFE_DATA
+        ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job
+        ;; The digest is stored transposed, so its five words are zeroed one
+        ;; at a time, each in its own row.
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0
+        mov     dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+        ;; lane_data is recomputed from idx rather than reused.
+        pxor    xmm0, xmm0
+        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+        lea     lane_data, [state + _ldata + lane_data]
+        ;; Clear first 64 bytes of extra_block
+        ;; (four unrolled 16-byte aligned stores)
+%assign offset 0
+%rep 4
+        movdqa  [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+        ;; Clear first 20 bytes of outer_block
+        ;; (one 16-byte store plus one dword)
+        movdqa  [lane_data + _outer_block], xmm0
+        mov     dword [lane_data + _outer_block + 16], 0
+%endif
+
+return:
+        ; Common epilogue; rax already holds the return value (job or NULL).
+        ; Restore the registers saved on entry, then reload the caller's rsp
+        ; from _rsp_save (the frame was aligned with "and", so rsp cannot be
+        ; recovered by a plain "add").
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*2]
+	mov	rdi, [rsp + _gpr_save + 8*3]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+        ret
+
+;; LINUX builds only: emit an empty .note.GNU-stack section flagged noexec,
+;; so this object does not ask the linker for an executable stack.
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c
new file mode 100644
index 000000000..4d862cba2
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c
@@ -0,0 +1,809 @@
+/*******************************************************************************
+  Copyright (c) 2012-2018, Intel Corporation
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+      * Neither the name of Intel Corporation nor the names of its contributors
+        may be used to endorse or promote products derived from this software
+        without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse
+
+#include "intel-ipsec-mb.h"
+#include "include/kasumi_internal.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+
+/*
+ * Prototypes of the SSE submit/flush primitives used by this manager.
+ *
+ * A submit_* routine taking an out-of-order (OOO) state returns either a
+ * completed job or NULL; the matching flush_* routine takes only the state.
+ * Most of these are implemented outside this file (for example
+ * submit_job_hmac_sse lives in mb_mgr_hmac_submit_sse.asm);
+ * submit_job_aes_cntr_sse and submit_job_aes_cntr_bit_sse are defined
+ * further down in this file.
+ */
+
+/* AES out-of-order schedulers, one per key size */
+JOB_AES_HMAC *submit_job_aes128_enc_sse(MB_MGR_AES_OOO *state,
+                                        JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_sse(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_sse(MB_MGR_AES_OOO *state,
+                                        JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_sse(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_sse(MB_MGR_AES_OOO *state,
+                                        JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_sse(MB_MGR_AES_OOO *state);
+
+/* HMAC-SHA1: SIMD variant and "_ni" variant */
+JOB_AES_HMAC *submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state,
+                                  JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state,
+                                     JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state);
+
+/* HMAC-SHA224 / SHA256 share the MB_MGR_HMAC_SHA_256_OOO state type */
+JOB_AES_HMAC *submit_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+                                             JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+                                             JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+/* HMAC-SHA384 / SHA512 share the MB_MGR_HMAC_SHA_512_OOO state type */
+JOB_AES_HMAC *submit_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state);
+
+/* HMAC-MD5 */
+JOB_AES_HMAC *submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state,
+                                      JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state);
+
+
+/* AES-XCBC, AES-CMAC and AES-CCM authentication */
+JOB_AES_HMAC *submit_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state,
+                                      JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state,
+                                           JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state,
+                                           JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state);
+
+/* AES counter mode: no OOO state, defined later in this file */
+JOB_AES_HMAC *submit_job_aes_cntr_sse(JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *submit_job_aes_cntr_bit_sse(JOB_AES_HMAC *job);
+
+/*
+ * Bind the architecture-neutral primitive names to their SSE
+ * implementations. NOTE(review): these are presumably consumed by shared
+ * manager code included further down in this file - that include is not
+ * visible in this part of the file, confirm there.
+ */
+
+/* XMM save/restore helpers */
+#define SAVE_XMMS               save_xmms
+#define RESTORE_XMMS            restore_xmms
+
+/* AES submit/flush, per key size, plus the ECB submit names */
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_sse
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse
+#define FLUSH_JOB_AES128_ENC  flush_job_aes128_enc_sse
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_sse
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse
+#define FLUSH_JOB_AES192_ENC  flush_job_aes192_enc_sse
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_sse
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse
+#define FLUSH_JOB_AES256_ENC  flush_job_aes256_enc_sse
+#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_sse
+#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_sse
+#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_sse
+#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_sse
+#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_sse
+#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_sse
+/* HMAC submit/flush; the *_NI names select the "_ni_sse" routines */
+#define SUBMIT_JOB_HMAC       submit_job_hmac_sse
+#define FLUSH_JOB_HMAC        flush_job_hmac_sse
+#define SUBMIT_JOB_HMAC_NI    submit_job_hmac_ni_sse
+#define FLUSH_JOB_HMAC_NI     flush_job_hmac_ni_sse
+#define SUBMIT_JOB_HMAC_SHA_224       submit_job_hmac_sha_224_sse
+#define FLUSH_JOB_HMAC_SHA_224        flush_job_hmac_sha_224_sse
+#define SUBMIT_JOB_HMAC_SHA_224_NI    submit_job_hmac_sha_224_ni_sse
+#define FLUSH_JOB_HMAC_SHA_224_NI     flush_job_hmac_sha_224_ni_sse
+#define SUBMIT_JOB_HMAC_SHA_256       submit_job_hmac_sha_256_sse
+#define FLUSH_JOB_HMAC_SHA_256        flush_job_hmac_sha_256_sse
+#define SUBMIT_JOB_HMAC_SHA_256_NI    submit_job_hmac_sha_256_ni_sse
+#define FLUSH_JOB_HMAC_SHA_256_NI     flush_job_hmac_sha_256_ni_sse
+#define SUBMIT_JOB_HMAC_SHA_384       submit_job_hmac_sha_384_sse
+#define FLUSH_JOB_HMAC_SHA_384        flush_job_hmac_sha_384_sse
+#define SUBMIT_JOB_HMAC_SHA_512       submit_job_hmac_sha_512_sse
+#define FLUSH_JOB_HMAC_SHA_512        flush_job_hmac_sha_512_sse
+#define SUBMIT_JOB_HMAC_MD5   submit_job_hmac_md5_sse
+#define FLUSH_JOB_HMAC_MD5    flush_job_hmac_md5_sse
+#define SUBMIT_JOB_AES_XCBC   submit_job_aes_xcbc_sse
+#define FLUSH_JOB_AES_XCBC    flush_job_aes_xcbc_sse
+
+/* Counter-mode submit wrappers (defined in this file) */
+#define SUBMIT_JOB_AES_CNTR   submit_job_aes_cntr_sse
+#define SUBMIT_JOB_AES_CNTR_BIT   submit_job_aes_cntr_bit_sse
+
+/* Cipher kernels */
+#define AES_CBC_DEC_128       aes_cbc_dec_128_sse
+#define AES_CBC_DEC_192       aes_cbc_dec_192_sse
+#define AES_CBC_DEC_256       aes_cbc_dec_256_sse
+
+#define AES_CNTR_128       aes_cntr_128_sse
+#define AES_CNTR_192       aes_cntr_192_sse
+#define AES_CNTR_256       aes_cntr_256_sse
+
+#define AES_CNTR_CCM_128   aes_cntr_ccm_128_sse
+
+#define AES_ECB_ENC_128       aes_ecb_enc_128_sse
+#define AES_ECB_ENC_192       aes_ecb_enc_192_sse
+#define AES_ECB_ENC_256       aes_ecb_enc_256_sse
+#define AES_ECB_DEC_128       aes_ecb_dec_128_sse
+#define AES_ECB_DEC_192       aes_ecb_dec_192_sse
+#define AES_ECB_DEC_256       aes_ecb_dec_256_sse
+
+/* PON submit names */
+#define SUBMIT_JOB_PON_ENC        submit_job_pon_enc_sse
+#define SUBMIT_JOB_PON_DEC        submit_job_pon_dec_sse
+#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_sse
+#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_sse
+
+/* GCM kernels and submit/flush names; compiled out when NO_GCM is defined */
+#ifndef NO_GCM
+#define AES_GCM_DEC_128   aes_gcm_dec_128_sse
+#define AES_GCM_ENC_128   aes_gcm_enc_128_sse
+#define AES_GCM_DEC_192   aes_gcm_dec_192_sse
+#define AES_GCM_ENC_192   aes_gcm_enc_192_sse
+#define AES_GCM_DEC_256   aes_gcm_dec_256_sse
+#define AES_GCM_ENC_256   aes_gcm_enc_256_sse
+
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_sse
+#define FLUSH_JOB_AES_GCM_DEC  flush_job_aes_gcm_dec_sse
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_sse
+#define FLUSH_JOB_AES_GCM_ENC  flush_job_aes_gcm_enc_sse
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+/* Names of the manager entry points for the SSE architecture */
+#define SUBMIT_JOB         submit_job_sse
+#define FLUSH_JOB          flush_job_sse
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_sse
+#define GET_NEXT_JOB       get_next_job_sse
+#define GET_COMPLETED_JOB  get_completed_job_sse
+
+/*
+ * NOTE: the three *_DEC macros below repeat, token for token, definitions
+ * already made earlier in this file. An identical macro redefinition is
+ * valid C, so this is harmless but redundant.
+ */
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse
+#define QUEUE_SIZE queue_size_sse
+
+/* ====================================================================== */
+
+/*
+ * Cipher/hash dispatcher names. The *_SSE identifiers on the right-hand side
+ * are not defined in the visible part of this file.
+ */
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_SSE
+#define FLUSH_JOB_AES_ENC  FLUSH_JOB_AES_ENC_SSE
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_SSE
+#define SUBMIT_JOB_HASH    SUBMIT_JOB_HASH_SSE
+#define FLUSH_JOB_HASH     FLUSH_JOB_HASH_SSE
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE    aes_cfb_128_one_sse
+
+/* 4-lane AES-128 CBC-MAC kernel, bound to AES128_CBC_MAC below */
+void aes128_cbc_mac_x4(AES_ARGS *args, uint64_t len);
+
+#define AES128_CBC_MAC     aes128_cbc_mac_x4
+
+#define FLUSH_JOB_AES_CCM_AUTH     flush_job_aes_ccm_auth_sse
+#define SUBMIT_JOB_AES_CCM_AUTH    submit_job_aes_ccm_auth_sse
+
+#define FLUSH_JOB_AES_CMAC_AUTH    flush_job_aes_cmac_auth_sse
+#define SUBMIT_JOB_AES_CMAC_AUTH   submit_job_aes_cmac_auth_sse
+
+/* ====================================================================== */
+
+/*
+ * Used to decide if SHA1/SHA256 SIMD or SHA1NI OOO scheduler should be
+ * called.
+ * NOTE(review): the code that tests this macro is outside the visible part
+ * of this file.
+ */
+#define HASH_USE_SHAEXT 1
+
+
+/* ====================================================================== */
+
+/*
+ * GCM submit / flush API for SSE arch
+ */
+#ifndef NO_GCM
+/*
+ * AES-GCM decrypt "submit" for the SSE manager.
+ *
+ * Runs the whole job synchronously: picks the AES_GCM_DEC_* routine that
+ * matches job->aes_key_len_in_bytes (16, 24, anything else is treated as
+ * 32), using a stack-local GCM context, then marks the job STS_COMPLETED
+ * and returns it. The manager state is not used.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_dec_sse(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * AES-GCM decrypt "flush" for the SSE manager.
+ *
+ * The matching submit routine completes every job before returning, so
+ * there is never anything to flush: both arguments are ignored and NULL is
+ * returned.
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_dec_sse(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) job;
+        (void) state;
+
+        return NULL;
+}
+
+/*
+ * AES-GCM encrypt "submit" for the SSE manager.
+ *
+ * Runs the whole job synchronously: picks the AES_GCM_ENC_* routine that
+ * matches job->aes_key_len_in_bytes (16, 24, anything else is treated as
+ * 32), using a stack-local GCM context, then marks the job STS_COMPLETED
+ * and returns it. The manager state is not used.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_enc_sse(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * AES-GCM encrypt "flush" for the SSE manager.
+ *
+ * The matching submit routine completes every job before returning, so
+ * there is never anything to flush: both arguments are ignored and NULL is
+ * returned.
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_enc_sse(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) job;
+        (void) state;
+
+        return NULL;
+}
+#endif /* NO_GCM */
+
+/*
+ * AES counter-mode submit (byte-length messages) for the SSE manager.
+ *
+ * Processes the job synchronously with the AES_CNTR_* routine matching
+ * job->aes_key_len_in_bytes (16, 24, anything else is treated as 32), ORs
+ * STS_COMPLETED_AES into job->status and returns the job.
+ */
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_sse(JOB_AES_HMAC *job)
+{
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_CNTR_128(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+                break;
+        case 24:
+                AES_CNTR_192(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_CNTR_256(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+                break;
+        }
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+/*
+ * AES counter-mode submit (bit-length messages) for the SSE manager.
+ *
+ * Same shape as submit_job_aes_cntr_sse, but the message length passed down
+ * is job->msg_len_to_cipher_in_bits and the aes_cntr_bit_*_sse routines are
+ * called directly. The routine is chosen from job->aes_key_len_in_bytes
+ * (16, 24, anything else is treated as 32); STS_COMPLETED_AES is ORed into
+ * job->status and the job is returned.
+ */
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_bit_sse(JOB_AES_HMAC *job)
+{
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                aes_cntr_bit_128_sse(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+                break;
+        case 24:
+                aes_cntr_bit_192_sse(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                aes_cntr_bit_256_sse(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+                break;
+        }
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+/* ====================================================================== */
+/* Set up *state for the SSE code path: OOO manager fields + API handlers. */
+void
+init_mb_mgr_sse(MB_MGR *state)
+{
+        unsigned int j;
+        uint8_t *p;
+        size_t size;
+
+        state->features = cpu_feature_adjust(state->flags,
+                                             cpu_feature_detect());
+        /* No AES-NI: defer entirely to the no-AESNI initialiser and stop */
+        if (!(state->features & IMB_FEATURE_AESNI)) {
+                init_mb_mgr_sse_no_aesni(state);
+                return;
+        }
+
+        /* Init AES OOO fields: lens[] filled with 0xFF, first 4 zeroed */
+        memset(state->aes128_ooo.lens, 0xFF,
+               sizeof(state->aes128_ooo.lens));
+        memset(&state->aes128_ooo.lens[0], 0,
+               sizeof(state->aes128_ooo.lens[0]) * 4);
+        memset(state->aes128_ooo.job_in_lane, 0,
+               sizeof(state->aes128_ooo.job_in_lane));
+        state->aes128_ooo.unused_lanes = 0xFF03020100;
+        state->aes128_ooo.num_lanes_inuse = 0;
+        /* (0xFF03020100: lane ids 0-3 one per byte, low byte first, then 0xFF) */
+
+        memset(state->aes192_ooo.lens, 0xFF,
+               sizeof(state->aes192_ooo.lens));
+        memset(&state->aes192_ooo.lens[0], 0,
+               sizeof(state->aes192_ooo.lens[0]) * 4);
+        memset(state->aes192_ooo.job_in_lane, 0,
+               sizeof(state->aes192_ooo.job_in_lane));
+        state->aes192_ooo.unused_lanes = 0xFF03020100;
+        state->aes192_ooo.num_lanes_inuse = 0;
+
+
+        memset(state->aes256_ooo.lens, 0xFF,
+               sizeof(state->aes256_ooo.lens));
+        memset(&state->aes256_ooo.lens[0], 0,
+               sizeof(state->aes256_ooo.lens[0]) * 4);
+        memset(state->aes256_ooo.job_in_lane, 0,
+               sizeof(state->aes256_ooo.job_in_lane));
+        state->aes256_ooo.unused_lanes = 0xFF03020100;
+        state->aes256_ooo.num_lanes_inuse = 0;
+
+
+        /* DOCSIS SEC BPI uses same settings as AES128 CBC */
+        memset(state->docsis_sec_ooo.lens, 0xFF,
+               sizeof(state->docsis_sec_ooo.lens));
+        memset(&state->docsis_sec_ooo.lens[0], 0,
+               sizeof(state->docsis_sec_ooo.lens[0]) * 4);
+        memset(state->docsis_sec_ooo.job_in_lane, 0,
+               sizeof(state->docsis_sec_ooo.job_in_lane));
+        state->docsis_sec_ooo.unused_lanes = 0xFF03020100;
+        state->docsis_sec_ooo.num_lanes_inuse = 0;
+
+
+        /* Init HMAC/SHA1 out-of-order fields: lens[0..3] = 0, rest 0xFFFF */
+        state->hmac_sha_1_ooo.lens[0] = 0;
+        state->hmac_sha_1_ooo.lens[1] = 0;
+        state->hmac_sha_1_ooo.lens[2] = 0;
+        state->hmac_sha_1_ooo.lens[3] = 0;
+        state->hmac_sha_1_ooo.lens[4] = 0xFFFF;
+        state->hmac_sha_1_ooo.lens[5] = 0xFFFF;
+        state->hmac_sha_1_ooo.lens[6] = 0xFFFF;
+        state->hmac_sha_1_ooo.lens[7] = 0xFFFF;
+        state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100;
+        for (j = 0; j < SSE_NUM_SHA1_LANES; j++) {
+                state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL;
+                state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80;
+                memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65,
+                       0x00,
+                       64+7); /* zeroes bytes 65..135 */
+                p = state->hmac_sha_1_ooo.ldata[j].outer_block;
+                memset(p + 5*4 + 1,
+                       0x00,
+                       64 - 5*4 - 1 - 2); /* bytes 21..61; 62..63 set below */
+                p[5*4] = 0x80;  /* 0x80 right after the 5*4 digest bytes */
+                p[64-2] = 0x02; /* 0x02A0 = 64*8 + 160 bits, high byte first */
+                p[64-1] = 0xA0;
+        }
+
+#ifdef HASH_USE_SHAEXT
+        if (state->features & IMB_FEATURE_SHANI) {
+                /* Init HMAC/SHA1 NI out-of-order fields (overrides the above) */
+                state->hmac_sha_1_ooo.lens[0] = 0;
+                state->hmac_sha_1_ooo.lens[1] = 0;
+                state->hmac_sha_1_ooo.lens[2] = 0xFFFF;
+                state->hmac_sha_1_ooo.lens[3] = 0xFFFF;
+                state->hmac_sha_1_ooo.lens[4] = 0xFFFF;
+                state->hmac_sha_1_ooo.lens[5] = 0xFFFF;
+                state->hmac_sha_1_ooo.lens[6] = 0xFFFF;
+                state->hmac_sha_1_ooo.lens[7] = 0xFFFF;
+                state->hmac_sha_1_ooo.unused_lanes = 0xFF0100;
+        }
+#endif /* HASH_USE_SHAEXT */
+
+        /* Init HMAC/SHA224 out-of-order fields */
+        state->hmac_sha_224_ooo.lens[0] = 0;
+        state->hmac_sha_224_ooo.lens[1] = 0;
+        state->hmac_sha_224_ooo.lens[2] = 0;
+        state->hmac_sha_224_ooo.lens[3] = 0;
+        state->hmac_sha_224_ooo.lens[4] = 0xFFFF;
+        state->hmac_sha_224_ooo.lens[5] = 0xFFFF;
+        state->hmac_sha_224_ooo.lens[6] = 0xFFFF;
+        state->hmac_sha_224_ooo.lens[7] = 0xFFFF;
+        state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100;
+        for (j = 0; j < SSE_NUM_SHA256_LANES; j++) {
+                state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL;
+
+                p = state->hmac_sha_224_ooo.ldata[j].extra_block;
+                size = sizeof(state->hmac_sha_224_ooo.ldata[j].extra_block);
+                memset (p, 0x00, size);
+                p[64] = 0x80;
+
+                p = state->hmac_sha_224_ooo.ldata[j].outer_block;
+                size = sizeof(state->hmac_sha_224_ooo.ldata[j].outer_block);
+                memset(p, 0x00, size);
+                p[7*4] = 0x80;  /* digest 7 words long */
+                p[64-2] = 0x02; /* bit length 0x02E0 (64*8 + 224), high byte first */
+                p[64-1] = 0xE0;
+        }
+#ifdef HASH_USE_SHAEXT
+        if (state->features & IMB_FEATURE_SHANI) {
+                /* Init HMAC/SHA224 NI out-of-order fields (overrides the above) */
+                state->hmac_sha_224_ooo.lens[0] = 0;
+                state->hmac_sha_224_ooo.lens[1] = 0;
+                state->hmac_sha_224_ooo.lens[2] = 0xFFFF;
+                state->hmac_sha_224_ooo.lens[3] = 0xFFFF;
+                state->hmac_sha_224_ooo.lens[4] = 0xFFFF;
+                state->hmac_sha_224_ooo.lens[5] = 0xFFFF;
+                state->hmac_sha_224_ooo.lens[6] = 0xFFFF;
+                state->hmac_sha_224_ooo.lens[7] = 0xFFFF;
+                state->hmac_sha_224_ooo.unused_lanes = 0xFF0100;
+        }
+#endif /* HASH_USE_SHAEXT */
+
+        /* Init HMAC/SHA_256 out-of-order fields */
+        state->hmac_sha_256_ooo.lens[0] = 0;
+        state->hmac_sha_256_ooo.lens[1] = 0;
+        state->hmac_sha_256_ooo.lens[2] = 0;
+        state->hmac_sha_256_ooo.lens[3] = 0;
+        state->hmac_sha_256_ooo.lens[4] = 0xFFFF;
+        state->hmac_sha_256_ooo.lens[5] = 0xFFFF;
+        state->hmac_sha_256_ooo.lens[6] = 0xFFFF;
+        state->hmac_sha_256_ooo.lens[7] = 0xFFFF;
+        state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100;
+        for (j = 0; j < SSE_NUM_SHA256_LANES; j++) {
+                state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL;
+                state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80;
+                memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65,
+                       0x00,
+                       64+7); /* zeroes bytes 65..135 */
+                p = state->hmac_sha_256_ooo.ldata[j].outer_block;
+                memset(p + 8*4 + 1,
+                       0x00,
+                       64 - 8*4 - 1 - 2); /* digest is 8*4 bytes long */
+                p[8*4] = 0x80;
+                p[64-2] = 0x03; /* length of (opad (64*8) bits + 256 bits)
+                                 * in hex is 0x300 */
+                p[64-1] = 0x00;
+        }
+#ifdef HASH_USE_SHAEXT
+        if (state->features & IMB_FEATURE_SHANI) {
+                /* Init HMAC/SHA256 NI out-of-order fields (overrides the above) */
+                state->hmac_sha_256_ooo.lens[0] = 0;
+                state->hmac_sha_256_ooo.lens[1] = 0;
+                state->hmac_sha_256_ooo.lens[2] = 0xFFFF;
+                state->hmac_sha_256_ooo.lens[3] = 0xFFFF;
+                state->hmac_sha_256_ooo.lens[4] = 0xFFFF;
+                state->hmac_sha_256_ooo.lens[5] = 0xFFFF;
+                state->hmac_sha_256_ooo.lens[6] = 0xFFFF;
+                state->hmac_sha_256_ooo.lens[7] = 0xFFFF;
+                state->hmac_sha_256_ooo.unused_lanes = 0xFF0100;
+        }
+#endif /* HASH_USE_SHAEXT */
+
+        /* Init HMAC/SHA384 out-of-order fields: lens[0..1] = 0, rest 0xFFFF */
+        state->hmac_sha_384_ooo.lens[0] = 0;
+        state->hmac_sha_384_ooo.lens[1] = 0;
+        state->hmac_sha_384_ooo.lens[2] = 0xFFFF;
+        state->hmac_sha_384_ooo.lens[3] = 0xFFFF;
+        state->hmac_sha_384_ooo.lens[4] = 0xFFFF;
+        state->hmac_sha_384_ooo.lens[5] = 0xFFFF;
+        state->hmac_sha_384_ooo.lens[6] = 0xFFFF;
+        state->hmac_sha_384_ooo.lens[7] = 0xFFFF;
+        state->hmac_sha_384_ooo.unused_lanes = 0xFF0100;
+        for (j = 0; j < SSE_NUM_SHA512_LANES; j++) {
+                MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo;
+
+                ctx->ldata[j].job_in_lane = NULL;
+                ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80;
+                memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1),
+                       0x00, SHA_384_BLOCK_SIZE + 7);
+
+                p = ctx->ldata[j].outer_block;
+                memset(p + SHA384_DIGEST_SIZE_IN_BYTES  + 1, 0x00,
+                       /* special end point because this length is constant */
+                       SHA_384_BLOCK_SIZE -
+                       SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2);
+                p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */
+                /*
+                 * hmac outer block length always of fixed size, it is OKey
+                 * length, a whole message block length, 1024 bits, with padding
+                 * plus the length of the inner digest, which is 384 bits
+                 * 1408 bits == 0x0580. The input message block needs to be
+                 * converted to big endian within the sha implementation
+                 * before use.
+                 */
+                p[SHA_384_BLOCK_SIZE - 2] = 0x05;
+                p[SHA_384_BLOCK_SIZE - 1] = 0x80;
+        }
+
+        /* Init HMAC/SHA512 out-of-order fields: lens[0..1] = 0, rest 0xFFFF */
+        state->hmac_sha_512_ooo.lens[0] = 0;
+        state->hmac_sha_512_ooo.lens[1] = 0;
+        state->hmac_sha_512_ooo.lens[2] = 0xFFFF;
+        state->hmac_sha_512_ooo.lens[3] = 0xFFFF;
+        state->hmac_sha_512_ooo.lens[4] = 0xFFFF;
+        state->hmac_sha_512_ooo.lens[5] = 0xFFFF;
+        state->hmac_sha_512_ooo.lens[6] = 0xFFFF;
+        state->hmac_sha_512_ooo.lens[7] = 0xFFFF;
+        state->hmac_sha_512_ooo.unused_lanes = 0xFF0100;
+        for (j = 0; j < SSE_NUM_SHA512_LANES; j++) {
+                MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo;
+
+                ctx->ldata[j].job_in_lane = NULL;
+                ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80;
+                memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1),
+                       0x00, SHA_512_BLOCK_SIZE + 7);
+
+                p = ctx->ldata[j].outer_block;
+                memset(p + SHA512_DIGEST_SIZE_IN_BYTES  + 1, 0x00,
+                       /* special end point because this length is constant */
+                       SHA_512_BLOCK_SIZE -
+                       SHA512_DIGEST_SIZE_IN_BYTES  - 1 - 2);
+                p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */
+                /*
+                 * hmac outer block length always of fixed size, it is OKey
+                 * length, a whole message block length, 1024 bits, with padding
+                 * plus the length of the inner digest, which is 512 bits
+                 * 1536 bits == 0x600. The input message block needs to be
+                 * converted to big endian within the sha implementation
+                 * before use.
+                 */
+                p[SHA_512_BLOCK_SIZE - 2] = 0x06;
+                p[SHA_512_BLOCK_SIZE - 1] = 0x00;
+        }
+
+        /* Init HMAC/MD5 out-of-order fields: lens[0..7] = 0, rest 0xFFFF */
+        state->hmac_md5_ooo.lens[0] = 0;
+        state->hmac_md5_ooo.lens[1] = 0;
+        state->hmac_md5_ooo.lens[2] = 0;
+        state->hmac_md5_ooo.lens[3] = 0;
+        state->hmac_md5_ooo.lens[4] = 0;
+        state->hmac_md5_ooo.lens[5] = 0;
+        state->hmac_md5_ooo.lens[6] = 0;
+        state->hmac_md5_ooo.lens[7] = 0;
+        state->hmac_md5_ooo.lens[8] = 0xFFFF;
+        state->hmac_md5_ooo.lens[9] = 0xFFFF;
+        state->hmac_md5_ooo.lens[10] = 0xFFFF;
+        state->hmac_md5_ooo.lens[11] = 0xFFFF;
+        state->hmac_md5_ooo.lens[12] = 0xFFFF;
+        state->hmac_md5_ooo.lens[13] = 0xFFFF;
+        state->hmac_md5_ooo.lens[14] = 0xFFFF;
+        state->hmac_md5_ooo.lens[15] = 0xFFFF;
+        state->hmac_md5_ooo.unused_lanes = 0xF76543210; /* ids 0-7, one per nibble */
+        for (j = 0; j < SSE_NUM_MD5_LANES; j++) {
+                state->hmac_md5_ooo.ldata[j].job_in_lane = NULL;
+
+                p = state->hmac_md5_ooo.ldata[j].extra_block;
+                size = sizeof(state->hmac_md5_ooo.ldata[j].extra_block);
+                memset (p, 0x00, size);
+                p[64] = 0x80;
+
+                p = state->hmac_md5_ooo.ldata[j].outer_block;
+                size = sizeof(state->hmac_md5_ooo.ldata[j].outer_block);
+                memset(p, 0x00, size);
+                p[4*4] = 0x80;  /* 0x80 right after the 4*4 digest bytes */
+                p[64-7] = 0x02; /* 0x0280 = 64*8 + 128 bits, low byte first */
+                p[64-8] = 0x80;
+        }
+
+        /* Init AES/XCBC OOO fields */
+        state->aes_xcbc_ooo.lens[0] = 0;
+        state->aes_xcbc_ooo.lens[1] = 0;
+        state->aes_xcbc_ooo.lens[2] = 0;
+        state->aes_xcbc_ooo.lens[3] = 0;
+        state->aes_xcbc_ooo.lens[4] = 0xFFFF;
+        state->aes_xcbc_ooo.lens[5] = 0xFFFF;
+        state->aes_xcbc_ooo.lens[6] = 0xFFFF;
+        state->aes_xcbc_ooo.lens[7] = 0xFFFF;
+        state->aes_xcbc_ooo.unused_lanes = 0xFF03020100;
+        for (j = 0; j < 4; j++) {
+                state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL;
+                state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80;
+                memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15);
+        }
+
+        /* Init AES-CCM auth out-of-order fields */
+        for (j = 0; j < 4; j++) {
+                state->aes_ccm_ooo.init_done[j] = 0;
+                state->aes_ccm_ooo.lens[j] = 0;
+                state->aes_ccm_ooo.job_in_lane[j] = NULL;
+        }
+        for (; j < 8; j++) /* j carries on from 4 */
+                state->aes_ccm_ooo.lens[j] = 0xFFFF;
+
+        state->aes_ccm_ooo.unused_lanes = 0xF3210; /* ids 0-3, one per nibble */
+
+        /* Init AES-CMAC auth out-of-order fields */
+        state->aes_cmac_ooo.lens[0] = 0;
+        state->aes_cmac_ooo.lens[1] = 0;
+        state->aes_cmac_ooo.lens[2] = 0;
+        state->aes_cmac_ooo.lens[3] = 0;
+        state->aes_cmac_ooo.lens[4] = 0xFFFF;
+        state->aes_cmac_ooo.lens[5] = 0xFFFF;
+        state->aes_cmac_ooo.lens[6] = 0xFFFF;
+        state->aes_cmac_ooo.lens[7] = 0xFFFF;
+        for (j = 0; j < 4; j++) {
+                state->aes_cmac_ooo.init_done[j] = 0;
+                state->aes_cmac_ooo.job_in_lane[j] = NULL;
+        }
+        state->aes_cmac_ooo.unused_lanes = 0xF3210;
+
+        /* Init "in order" components */
+        state->next_job = 0;
+        state->earliest_job = -1;
+
+        /* set SSE handlers (function pointers stored in the manager) */
+        state->get_next_job        = get_next_job_sse;
+        state->submit_job          = submit_job_sse;
+        state->submit_job_nocheck  = submit_job_nocheck_sse;
+        state->get_completed_job   = get_completed_job_sse;
+        state->flush_job           = flush_job_sse;
+        state->queue_size          = queue_size_sse;
+        state->keyexp_128          = aes_keyexp_128_sse;
+        state->keyexp_192          = aes_keyexp_192_sse;
+        state->keyexp_256          = aes_keyexp_256_sse;
+        state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_sse;
+        state->xcbc_keyexp         = aes_xcbc_expand_key_sse;
+        state->des_key_sched       = des_key_schedule;
+        state->sha1_one_block      = sha1_one_block_sse;
+        state->sha1                = sha1_sse;
+        state->sha224_one_block    = sha224_one_block_sse;
+        state->sha224              = sha224_sse;
+        state->sha256_one_block    = sha256_one_block_sse;
+        state->sha256              = sha256_sse;
+        state->sha384_one_block    = sha384_one_block_sse;
+        state->sha384              = sha384_sse;
+        state->sha512_one_block    = sha512_one_block_sse;
+        state->sha512              = sha512_sse;
+        state->md5_one_block       = md5_one_block_sse;
+        state->aes128_cfb_one      = aes_cfb_128_one_sse;
+
+        state->eea3_1_buffer       = zuc_eea3_1_buffer_sse;
+        state->eea3_4_buffer       = zuc_eea3_4_buffer_sse;
+        state->eea3_n_buffer       = zuc_eea3_n_buffer_sse;
+        state->eia3_1_buffer       = zuc_eia3_1_buffer_sse;
+
+        state->f8_1_buffer         = kasumi_f8_1_buffer_sse;
+        state->f8_1_buffer_bit     = kasumi_f8_1_buffer_bit_sse;
+        state->f8_2_buffer         = kasumi_f8_2_buffer_sse;
+        state->f8_3_buffer         = kasumi_f8_3_buffer_sse;
+        state->f8_4_buffer         = kasumi_f8_4_buffer_sse;
+        state->f8_n_buffer         = kasumi_f8_n_buffer_sse;
+        state->f9_1_buffer         = kasumi_f9_1_buffer_sse;
+        state->f9_1_buffer_user    = kasumi_f9_1_buffer_user_sse;
+        state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_sse;
+        state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_sse;
+        state->kasumi_key_sched_size = kasumi_key_sched_size_sse;
+
+        state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_sse;
+        state->snow3g_f8_1_buffer  = snow3g_f8_1_buffer_sse;
+        state->snow3g_f8_2_buffer  = snow3g_f8_2_buffer_sse;
+        state->snow3g_f8_4_buffer  = snow3g_f8_4_buffer_sse;
+        state->snow3g_f8_8_buffer  = snow3g_f8_8_buffer_sse;
+        state->snow3g_f8_n_buffer  = snow3g_f8_n_buffer_sse;
+        state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_sse;
+        state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_sse;
+        state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_sse;
+        state->snow3g_init_key_sched = snow3g_init_key_sched_sse;
+        state->snow3g_key_sched_size = snow3g_key_sched_size_sse;
+
+#ifndef NO_GCM
+        state->gcm128_enc          = aes_gcm_enc_128_sse;
+        state->gcm192_enc          = aes_gcm_enc_192_sse;
+        state->gcm256_enc          = aes_gcm_enc_256_sse;
+        state->gcm128_dec          = aes_gcm_dec_128_sse;
+        state->gcm192_dec          = aes_gcm_dec_192_sse;
+        state->gcm256_dec          = aes_gcm_dec_256_sse;
+        state->gcm128_init         = aes_gcm_init_128_sse;
+        state->gcm192_init         = aes_gcm_init_192_sse;
+        state->gcm256_init         = aes_gcm_init_256_sse;
+        state->gcm128_enc_update   = aes_gcm_enc_128_update_sse;
+        state->gcm192_enc_update   = aes_gcm_enc_192_update_sse;
+        state->gcm256_enc_update   = aes_gcm_enc_256_update_sse;
+        state->gcm128_dec_update   = aes_gcm_dec_128_update_sse;
+        state->gcm192_dec_update   = aes_gcm_dec_192_update_sse;
+        state->gcm256_dec_update   = aes_gcm_dec_256_update_sse;
+        state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_sse;
+        state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_sse;
+        state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_sse;
+        state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_sse;
+        state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_sse;
+        state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_sse;
+        state->gcm128_precomp      = aes_gcm_precomp_128_sse;
+        state->gcm192_precomp      = aes_gcm_precomp_192_sse;
+        state->gcm256_precomp      = aes_gcm_precomp_256_sse;
+        state->gcm128_pre          = aes_gcm_pre_128_sse;
+        state->gcm192_pre          = aes_gcm_pre_192_sse;
+        state->gcm256_pre          = aes_gcm_pre_256_sse;
+#endif
+}
+
+#include "mb_mgr_code.h"
diff --git a/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm
new file mode 100644
index 000000000..581e3fade
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm
@@ -0,0 +1,787 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute eight MD5 digests in parallel (two sets of four lanes) using SSE
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers:  rax rbx     rdx rsi rdi     r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves:         rcx             rbp
+;;
+;; Linux clobbers:    rax rbx rcx rdx rsi         r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves:                       rdi rbp
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+section .data align=64
+default rel
+
+align 64
+MKGLOBAL(MD5_TABLE,data,internal)
+MD5_TABLE:      ; 64 rows x 16 bytes: each 32-bit MD5 constant repeated 4x (one copy per dword lane)
+        dd      0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478  ; rows 0-15: T[1..16] in RFC 1321 numbering
+        dd      0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+        dd      0x242070db, 0x242070db, 0x242070db, 0x242070db
+        dd      0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+        dd      0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+        dd      0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+        dd      0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+        dd      0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+        dd      0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+        dd      0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+        dd      0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+        dd      0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+        dd      0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+        dd      0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+        dd      0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+        dd      0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+        dd      0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562  ; rows 16-31: T[17..32]
+        dd      0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+        dd      0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+        dd      0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+        dd      0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+        dd      0x02441453, 0x02441453, 0x02441453, 0x02441453
+        dd      0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+        dd      0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+        dd      0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+        dd      0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+        dd      0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+        dd      0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+        dd      0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+        dd      0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+        dd      0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+        dd      0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+        dd      0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942  ; rows 32-47: T[33..48]
+        dd      0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+        dd      0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+        dd      0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+        dd      0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+        dd      0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+        dd      0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+        dd      0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+        dd      0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+        dd      0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+        dd      0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+        dd      0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+        dd      0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+        dd      0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+        dd      0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+        dd      0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+        dd      0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244  ; rows 48-63: T[49..64]
+        dd      0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+        dd      0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+        dd      0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+        dd      0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+        dd      0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+        dd      0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+        dd      0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+        dd      0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+        dd      0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+        dd      0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+        dd      0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+        dd      0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+        dd      0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+        dd      0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+        dd      0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+
+ONES:   ; all-ones mask: pxor with it gives bitwise NOT (MAGIC_I); 16-byte aligned only because it directly follows the 1024-byte table
+        dd      0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+
+section .text
+
+%ifdef LINUX
+;; Linux Registers
+%define arg1	rdi     ; 1st argument: pointer to MD5_ARGS
+%define arg2	rsi     ; 2nd argument: number of blocks
+%define mem1    rcx     ; ping-pong pointer to stack data block 1
+%define mem2    rdx     ; ping-pong pointer to stack data block 2
+%else
+%define arg1	rcx     ; 1st argument: pointer to MD5_ARGS
+%define arg2	rdx     ; 2nd argument: number of blocks
+%define mem1    rdi     ; ping-pong pointer to stack data block 1
+%define mem2    rsi     ; ping-pong pointer to stack data block 2
+%endif
+
+;; rbp is not clobbered
+
+%define inp0 r8         ; inp0..inp7: input data pointers, one per lane
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+%define TBL   rax       ; address of MD5_TABLE
+%define IDX   rbx       ; byte offset added to every input pointer
+
+%define A       xmm0    ; A..D: digest words for the first 16 bytes of each digest row
+%define B       xmm1
+%define C       xmm2
+%define D       xmm3
+%define E       xmm4 ; tmp
+%define F       xmm5 ; tmp
+
+%define A2      xmm6    ; A2..D2: digest words for the second 16 bytes of each digest row
+%define B2      xmm7
+%define C2      xmm8
+%define D2      xmm9
+
+
+%define FUN     E
+%define TMP     F
+%define FUN2    xmm10   ; same register as T0
+%define TMP2    xmm11   ; same register as T1
+
+%define T0      xmm10   ; T0..T5: temporaries for loading/transposing input
+%define T1      xmm11
+%define T2      xmm12
+%define T3      xmm13
+%define T4      xmm14
+%define T5      xmm15
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4   \
+; ...                              \
+; 300 data2[0]  for lanes 7...4     \
+; 2F0 data2[15] for lanes 3...0      > mem block 2
+; ...                               /
+; 210 data2[1]  for lanes 3...0    /
+; 200 data2[0]  for lanes 3...0   /
+;
+; 1F0 data1[15] for lanes 7...4   \
+; ...                              \
+; 100 data1[0]  for lanes 7...4     \
+;  F0 data1[15] for lanes 3...0      > mem block 1
+; ...                               /
+;  10 data1[1]  for lanes 3...0    /
+;   0 data1[0]  for lanes 3...0   /
+
+; stack size must be an odd multiple of 8 bytes so rsp is 16-byte aligned after the entry `sub rsp, STACK_SIZE`
+struc STACK
+_DATA:		reso	2*2*16	; 2 blocks * 2 sets of lanes * 16 regs (64 owords = 0x400 bytes)
+_DIGEST:	reso	8	; stores AA-DD, AA2-DD2 (8 owords starting at offset 0x400)
+		resb	8	; for alignment: total becomes 1160 bytes = 145 * 8
+endstruc
+%define STACK_SIZE STACK_size   ; STACK_size is emitted by the struc above
+
+%define AA      rsp + _DIGEST + 16*0    ; address expressions; use as [AA] etc.
+%define BB      rsp + _DIGEST + 16*1
+%define CC      rsp + _DIGEST + 16*2
+%define DD      rsp + _DIGEST + 16*3
+%define AA2     rsp + _DIGEST + 16*4    ; slots for the second register set
+%define BB2     rsp + _DIGEST + 16*5
+%define CC2     rsp + _DIGEST + 16*6
+%define DD2     rsp + _DIGEST + 16*7
+
+;;
+;; MD5 left rotations (number of bits)
+;; rotRS mirrors S11..S44 of the RFC 1321 reference code: R = round, S = slot in the 4-step cycle
+rot11 equ  7
+rot12 equ  12
+rot13 equ  17
+rot14 equ  22
+rot21 equ  5
+rot22 equ  9
+rot23 equ  14
+rot24 equ  20
+rot31 equ  4
+rot32 equ  11
+rot33 equ  16
+rot34 equ  23
+rot41 equ  6
+rot42 equ  10
+rot43 equ  15
+rot44 equ  21
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" the 4x4 dword matrix in {r0..r3} using temps {t0, t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+; r2 and t1 are clobbered (left holding intermediate values)
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+        movdqa  %%t0, %%r0
+        shufps  %%t0, %%r1, 0x44        ; t0 = {b1 b0 a1 a0}
+        shufps  %%r0, %%r1, 0xEE        ; r0 = {b3 b2 a3 a2}
+
+        movdqa  %%t1, %%r2
+        shufps  %%t1, %%r3, 0x44        ; t1 = {d1 d0 c1 c0}
+        shufps  %%r2, %%r3, 0xEE        ; r2 = {d3 d2 c3 c2}
+
+        movdqa  %%r1, %%t0
+        shufps  %%r1, %%t1, 0xDD        ; r1 = {d1 c1 b1 a1}
+
+        movdqa  %%r3, %%r0
+        shufps  %%r3, %%r2, 0xDD        ; r3 = {d3 c3 b3 a3}
+
+        shufps  %%r0, %%r2, 0x88        ; r0 = {d2 c2 b2 a2}
+        shufps  %%t0, %%t1, 0x88        ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;; (each writes only its first operand F, which must be a register distinct from X, Y and Z)
+; macro MAGIC_F F,X,Y,Z   ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        movdqa  %%F,%%Z         ; F = Z
+        pxor    %%F,%%Y         ; F = Y ^ Z
+        pand    %%F,%%X         ; F = X & (Y ^ Z)
+        pxor    %%F,%%Z         ; F = Z ^ (X & (Y ^ Z))
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z   ;; F = F((Z),(X),(Y)) = ((Y) ^ ((Z) & ((X) ^ (Y))))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        MAGIC_F %%F,%%Z,%%X,%%Y ; reuse MAGIC_F with the inputs rotated
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z   ;; F = ((X) ^ (Y) ^ (Z)); only F is written
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        movdqa  %%F,%%Z         ; F = Z
+        pxor    %%F,%%Y         ; F = Y ^ Z
+        pxor    %%F,%%X         ; F = X ^ Y ^ Z
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z   ;; F =  ((Y) ^ ((X) | ~(Z))); only F is written
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        movdqa  %%F,%%Z         ; F = Z
+        pxor    %%F,[rel ONES]  ; pnot     %%F  (memory operand: ONES must be 16-byte aligned)
+        por     %%F,%%X         ; F = X | ~Z
+        pxor    %%F,%%Y         ; F = Y ^ (X | ~Z)
+%endmacro
+
+; PROLD reg, imm, tmp  ;; rotate each dword of reg left by imm bits; tmp is clobbered
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+        movdqa  %%tmp, %%reg
+        psrld   %%tmp, (32-%%imm)       ; tmp = bits that wrap around to the bottom
+        pslld   %%reg, %%imm            ; reg = remaining bits shifted up
+        por     %%reg, %%tmp            ; combine both halves of the rotation
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN     %1
+%define %%A             %2
+%define %%B             %3
+%define %%C             %4
+%define %%D             %5
+%define %%A2            %6
+%define %%B2            %7
+%define %%C2            %8
+%define %%D2            %9
+%define %%FUN           %10
+%define %%TMP           %11
+%define %%data          %12
+%define %%MD5const      %13
+%define %%nrot          %14
+; same step on two register sets (second set indented); FUN and TMP are shared by both
+        paddd       %%A, %%MD5const
+                paddd       %%A2, %%MD5const
+        paddd       %%A, [%%data]
+                paddd       %%A2, [%%data + 16*16]      ; second set's data sits 256 bytes higher
+        %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+        paddd       %%A, %%FUN                          ; consume FUN before it is overwritten below
+                %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+                paddd       %%A2, %%FUN
+        PROLD       %%A,%%nrot, %%TMP
+                PROLD       %%A2,%%nrot, %%TMP
+        paddd       %%A, %%B
+                paddd       %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+;                MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN     %1
+%define %%A             %2
+%define %%B             %3
+%define %%C             %4
+%define %%D             %5
+%define %%A2            %6
+%define %%B2            %7
+%define %%C2            %8
+%define %%D2            %9
+%define %%FUN           %10
+%define %%TMP           %11
+%define %%FUN2          %12
+%define %%TMP2          %13
+%define %%data          %14
+%define %%MD5const      %15
+%define %%nrot          %16
+; same step on two register sets (second set indented); each set has its own FUN/TMP pair
+        paddd       %%A, %%MD5const
+                paddd       %%A2, %%MD5const
+        paddd       %%A, [%%data]
+                paddd       %%A2, [%%data + 16*16]      ; second set's data sits 256 bytes higher
+        %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+                %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+        paddd       %%A, %%FUN
+                paddd       %%A2, %%FUN2
+        PROLD       %%A,%%nrot, %%TMP
+                PROLD       %%A2,%%nrot, %%TMP2
+        paddd       %%A, %%B
+                paddd       %%A2, %%B2
+%endmacro
+
+; void md5_x4x2_sse(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure (digests and data pointers are read from it and written back on exit)
+; arg 2 : number of blocks (>=1); one block is 4*16 = 64 bytes taken from each of the 8 input buffers
+; 8 buffers are hashed together as two groups of 4: A,B,C,D (inp0-inp3) and A2,B2,C2,D2 (inp4-inp7)
+align 32
+MKGLOBAL(md5_x4x2_sse,function,internal)
+md5_x4x2_sse:
+
+        sub     rsp, STACK_SIZE         ; reserve the stack work area (mem1/mem2 below point into it)
+
+        ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2
+        ;; Initialize digests: first 16 bytes of each digest row (group A,B,C,D)
+        movdqa  A,[arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE]
+        movdqa  B,[arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE]
+        movdqa  C,[arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE]
+        movdqa  D,[arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+                ;; Initialize digests: second 16 bytes of each digest row (group A2,B2,C2,D2)
+                movdqa  A2,[arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE]
+                movdqa  B2,[arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE]
+                movdqa  C2,[arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE]
+                movdqa  D2,[arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+        lea     TBL, [rel MD5_TABLE]    ; TBL -> table of step constants, indexed as [TBL + step*16]
+
+        ;; load the 8 input pointers (one per buffer)
+        mov     inp0,[arg1+_data_ptr_md5  +0*PTR_SZ]
+        mov     inp1,[arg1+_data_ptr_md5  +1*PTR_SZ]
+        mov     inp2,[arg1+_data_ptr_md5  +2*PTR_SZ]
+        mov     inp3,[arg1+_data_ptr_md5  +3*PTR_SZ]
+                mov     inp4,[arg1+_data_ptr_md5  +4*PTR_SZ]
+                mov     inp5,[arg1+_data_ptr_md5  +5*PTR_SZ]
+                mov     inp6,[arg1+_data_ptr_md5  +6*PTR_SZ]
+                mov     inp7,[arg1+_data_ptr_md5  +7*PTR_SZ]
+        xor     IDX, IDX                ; IDX = byte offset applied to every input pointer
+
+        ; Make ping-pong pointers to the two memory blocks (mem1: block being hashed, mem2: block being preloaded)
+        mov     mem1, rsp
+        lea     mem2, [rsp + 16*16*2]
+
+
+;; Load the first 64-byte block of all 8 buffers, TRANSPOSE it 16 bytes at a time and save the rows to mem1
+%assign I 0
+%rep 4
+        movdqu  T2,[inp0+IDX+I*16]      ; buffers 0-3, bytes I*16 .. I*16+15 (unaligned loads)
+        movdqu  T1,[inp1+IDX+I*16]
+        movdqu  T4,[inp2+IDX+I*16]
+        movdqu  T3,[inp3+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5  ; the rows stored below come out in T0,T1,T2,T3
+        movdqa  [mem1+(I*4+0)*16],T0
+        movdqa  [mem1+(I*4+1)*16],T1
+        movdqa  [mem1+(I*4+2)*16],T2
+        movdqa  [mem1+(I*4+3)*16],T3
+
+        movdqu  T2,[inp4+IDX+I*16]      ; buffers 4-7, same bytes; rows go 16*16 bytes further in mem1
+        movdqu  T1,[inp5+IDX+I*16]
+        movdqu  T4,[inp6+IDX+I*16]
+        movdqu  T3,[inp7+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem1+(I*4+0)*16 + 16*16],T0
+        movdqa  [mem1+(I*4+1)*16 + 16*16],T1
+        movdqa  [mem1+(I*4+2)*16 + 16*16],T2
+        movdqa  [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+        ; save old digests of group A,B,C,D (added back after the 64 steps)
+        movdqa  [AA], A
+        movdqa  [BB], B
+        movdqa  [CC], C
+        movdqa  [DD], D
+                ; save old digests of group A2,B2,C2,D2
+                movdqa  [AA2], A2
+                movdqa  [BB2], B2
+                movdqa  [CC2], C2
+                movdqa  [DD2], D2
+
+        add     IDX, 4*16               ; IDX now addresses the block after the one held in mem1
+        sub     arg2, 1                 ; one block fewer to process
+        je      lastblock               ; block in mem1 is the last one: hash it without preloading
+
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11       ; MAGIC_F steps 0-7
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+        movdqu  T2,[inp0+IDX+I*16]      ; preload next block into mem2: buffers 0-3, bytes 0-15
+        movdqu  T1,[inp1+IDX+I*16]
+        movdqu  T4,[inp2+IDX+I*16]
+        movdqu  T3,[inp3+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16],T0
+        movdqa  [mem2+(I*4+1)*16],T1
+        movdqa  [mem2+(I*4+2)*16],T2
+        movdqa  [mem2+(I*4+3)*16],T3
+
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11       ; MAGIC_F steps 8-15
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+        movdqu  T2,[inp4+IDX+I*16]      ; preload next block into mem2: buffers 4-7, bytes 0-15
+        movdqu  T1,[inp5+IDX+I*16]
+        movdqu  T4,[inp6+IDX+I*16]
+        movdqu  T3,[inp7+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16 + 16*16],T0
+        movdqa  [mem2+(I*4+1)*16 + 16*16],T1
+        movdqa  [mem2+(I*4+2)*16 + 16*16],T2
+        movdqa  [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21       ; MAGIC_G steps 16-23
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+        movdqu  T2,[inp0+IDX+I*16]      ; preload next block into mem2: buffers 0-3, bytes 16-31
+        movdqu  T1,[inp1+IDX+I*16]
+        movdqu  T4,[inp2+IDX+I*16]
+        movdqu  T3,[inp3+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16],T0
+        movdqa  [mem2+(I*4+1)*16],T1
+        movdqa  [mem2+(I*4+2)*16],T2
+        movdqa  [mem2+(I*4+3)*16],T3
+
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21       ; MAGIC_G steps 24-31
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+        movdqu  T2,[inp4+IDX+I*16]      ; preload next block into mem2: buffers 4-7, bytes 16-31
+        movdqu  T1,[inp5+IDX+I*16]
+        movdqu  T4,[inp6+IDX+I*16]
+        movdqu  T3,[inp7+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16 + 16*16],T0
+        movdqa  [mem2+(I*4+1)*16 + 16*16],T1
+        movdqa  [mem2+(I*4+2)*16 + 16*16],T2
+        movdqa  [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31       ; MAGIC_H steps 32-39
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+        movdqu  T2,[inp0+IDX+I*16]      ; preload next block into mem2: buffers 0-3, bytes 32-47
+        movdqu  T1,[inp1+IDX+I*16]
+        movdqu  T4,[inp2+IDX+I*16]
+        movdqu  T3,[inp3+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16],T0
+        movdqa  [mem2+(I*4+1)*16],T1
+        movdqa  [mem2+(I*4+2)*16],T2
+        movdqa  [mem2+(I*4+3)*16],T3
+
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31       ; MAGIC_H steps 40-47
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+        movdqu  T2,[inp4+IDX+I*16]      ; preload next block into mem2: buffers 4-7, bytes 32-47
+        movdqu  T1,[inp5+IDX+I*16]
+        movdqu  T4,[inp6+IDX+I*16]
+        movdqu  T3,[inp7+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16 + 16*16],T0
+        movdqa  [mem2+(I*4+1)*16 + 16*16],T1
+        movdqa  [mem2+(I*4+2)*16 + 16*16],T2
+        movdqa  [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41       ; MAGIC_I steps 48-55
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+        movdqu  T2,[inp0+IDX+I*16]      ; preload next block into mem2: buffers 0-3, bytes 48-63
+        movdqu  T1,[inp1+IDX+I*16]
+        movdqu  T4,[inp2+IDX+I*16]
+        movdqu  T3,[inp3+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16],T0
+        movdqa  [mem2+(I*4+1)*16],T1
+        movdqa  [mem2+(I*4+2)*16],T2
+        movdqa  [mem2+(I*4+3)*16],T3
+
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41       ; MAGIC_I steps 56-63
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+        movdqu  T2,[inp4+IDX+I*16]      ; preload next block into mem2: buffers 4-7, bytes 48-63
+        movdqu  T1,[inp5+IDX+I*16]
+        movdqu  T4,[inp6+IDX+I*16]
+        movdqu  T3,[inp7+IDX+I*16]
+        TRANSPOSE       T2, T1, T4, T3, T0, T5
+        movdqa  [mem2+(I*4+0)*16 + 16*16],T0
+        movdqa  [mem2+(I*4+1)*16 + 16*16],T1
+        movdqa  [mem2+(I*4+2)*16 + 16*16],T2
+        movdqa  [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+        paddd   A,[AA]                  ; digest = step output + digest saved at the top of the loop
+        paddd   B,[BB]
+        paddd   C,[CC]
+        paddd   D,[DD]
+
+                paddd   A2,[AA2]
+                paddd   B2,[BB2]
+                paddd   C2,[CC2]
+                paddd   D2,[DD2]
+
+        ; swap mem1 and mem2: the block just preloaded becomes the block to hash
+        xchg    mem1, mem2
+
+        jmp     lloop
+
+lastblock:                              ; hash the block held in mem1; no further input is read
+
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11     ; MAGIC_F steps 0-15
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21     ; MAGIC_G steps 16-31
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31     ; MAGIC_H steps 32-47
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41     ; MAGIC_I steps 48-63
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+        paddd   A,[AA]                  ; digest = step output + digest saved before this block
+        paddd   B,[BB]
+        paddd   C,[CC]
+        paddd   D,[DD]
+
+                paddd   A2,[AA2]
+                paddd   B2,[BB2]
+                paddd   C2,[CC2]
+                paddd   D2,[DD2]
+
+        ; write out digests to the same MD5_ARGS rows they were loaded from
+        movdqu  [arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE], A
+        movdqu  [arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE], B
+        movdqu  [arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE], C
+        movdqu  [arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE], D
+                movdqu  [arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2
+                movdqu  [arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2
+                movdqu  [arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2
+                movdqu  [arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2
+
+        ;; update input pointers: advance each by IDX (64 bytes per processed block) and store them back
+        add     inp0, IDX
+        add     inp1, IDX
+        add     inp2, IDX
+        add     inp3, IDX
+        add     inp4, IDX
+        add     inp5, IDX
+        add     inp6, IDX
+        add     inp7, IDX
+        mov     [arg1 +_data_ptr_md5  + 0*PTR_SZ], inp0
+        mov     [arg1 +_data_ptr_md5  + 1*PTR_SZ], inp1
+        mov     [arg1 +_data_ptr_md5  + 2*PTR_SZ], inp2
+        mov     [arg1 +_data_ptr_md5  + 3*PTR_SZ], inp3
+        mov     [arg1 +_data_ptr_md5  + 4*PTR_SZ], inp4
+        mov     [arg1 +_data_ptr_md5  + 5*PTR_SZ], inp5
+        mov     [arg1 +_data_ptr_md5  + 6*PTR_SZ], inp6
+        mov     [arg1 +_data_ptr_md5  + 7*PTR_SZ], inp7
+
+        ;; Clear stack frame (72*16 bytes) - only assembled when SAFE_DATA is defined
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0              ; zero value written over the work area
+%assign i 0
+%rep (2*2*16+8)
+        movdqa	[rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+        ;;;;;;;;;;;;;;;;
+        ;; Postamble: release the stack work area reserved on entry
+        add     rsp, STACK_SIZE
+        ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/pon_sse.asm b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm
new file mode 100644
index 000000000..32585f5f8
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm
@@ -0,0 +1,875 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "job_aes_hmac.asm"
+%include "include/os.asm"
+%include "include/memcpy.asm"
+
+;;; This is an implementation of the stitched algorithms: AES128-CTR + CRC32 + BIP
+;;; This combination is required by the PON/xPON/gPON standards.
+;;; Note: BIP is a running XOR of double words
+;;; Order of operations:
+;;; - encrypt: CRC32, AES-CTR and BIP
+;;; - decrypt: BIP, AES-CTR and CRC32
+
+%ifndef DEC_FN_NAME                     ; each name below is only a default: an earlier definition is kept
+%define DEC_FN_NAME submit_job_pon_dec_sse
+%endif
+%ifndef ENC_FN_NAME
+%define ENC_FN_NAME submit_job_pon_enc_sse
+%endif
+%ifndef ENC_NO_CTR_FN_NAME
+%define ENC_NO_CTR_FN_NAME submit_job_pon_enc_no_ctr_sse
+%endif
+%ifndef DEC_NO_CTR_FN_NAME
+%define DEC_NO_CTR_FN_NAME submit_job_pon_dec_no_ctr_sse
+%endif
+
+extern byteswap_const                   ; PSHUFB mask applied to the counter block before encryption (DO_PON)
+extern ddq_add_1                        ; constant added to the counter block with PADDQ (DO_PON)
+
+section .data                           ; NOTE(review): nothing visible here writes these tables - confirm whether a read-only section would do
+default rel
+
+;;; Precomputed constants for CRC32 (Ethernet FCS)
+;;;   Parameters of the CRC algorithm, and its result for the 4 byte
+;;;   test buffer {0x01, 0x02, 0x03, 0x04}:
+;;;     Result     Poly       Init        RefIn  RefOut  XorOut
+;;;     0xB63CFBCD 0x04C11DB7 0xFFFFFFFF  true   true    0xFFFFFFFF
+align 16
+rk1:                                    ; NOTE(review): rk1/rk5/rk7 are used outside this part of the file - presumably CRC32 multiplier constants, confirm
+        dq 0x00000000ccaa009e, 0x00000001751997d0
+
+align 16
+rk5:
+        dq 0x00000000ccaa009e, 0x0000000163cd6124
+
+align 16
+rk7:
+        dq 0x00000001f7011640, 0x00000001db710640
+
+align 16
+pshufb_shf_table:
+        ;;  use these values for shift registers with the pshufb instruction
+        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+        dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+align 16
+init_crc_value:                         ; low 32 bits = 0xFFFFFFFF (the Init value listed above), rest zero
+        dq 0x00000000FFFFFFFF, 0x0000000000000000
+
+align 16
+mask:                                   ; low 64 bits set, top 64 bits clear
+        dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+
+align 16
+mask2:                                  ; every bit set except the low 32 bits
+        dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+align 16
+mask3:                                  ; 0x80 in each of the 16 bytes
+        dq 0x8080808080808080, 0x8080808080808080
+
+align 16
+mask_out_top_bytes:                     ; 16 x 0xFF then 16 x 0x00: a 16-byte load at (label + 16 - n) gives a mask with only the low n bytes set
+        dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+        dq 0x0000000000000000, 0x0000000000000000
+
+;; Precomputed constants for HEC calculation (XGEM header)
+;; POLY 0x53900000:
+;;         k1    = 0xf9800000
+;;         k2    = 0xa0900000
+;;         k3    = 0x7cc00000
+;;         q     = 0x46b927ec
+;;         p_res = 0x53900000
+
+align 16
+k3_q:                                   ; low qword = k3, high qword = q (values listed above)
+        dq 0x7cc00000, 0x46b927ec
+
+align 16
+p_res:                                  ; low qword = p_res (value listed above), high qword = 0
+        dq 0x53900000, 0
+
+align 16
+mask_out_top_64bits:                    ; low 64 bits set, top 64 bits clear
+        dq 0xffffffff_ffffffff, 0
+
+section .text
+
+%define NUM_AES_ROUNDS 10               ; round count handed to DO_PON by CIPHER_BIP_REST (AES128-CTR, see file description)
+
+;; note: leave xmm0 free for implicit blend
+%define xcounter xmm7
+%define xbip    xmm1
+%define xcrc    xmm2
+%define xcrckey xmm3
+%define xtmp1   xmm4
+%define xtmp2   xmm5
+%define xtmp3   xmm6
+%define xtmp4   xmm8
+
+%ifdef LINUX                            ; System V AMD64: arguments arrive in rdi, rsi, rdx, rcx
+%define arg1    rdi
+%define arg2    rsi
+%define arg3    rdx
+%define arg4    rcx
+%define tmp_1   r8
+%define tmp_2   r9
+%define tmp_3   r10
+%define tmp_4   r11
+%define tmp_5   r12                     ; r12-r14 are callee-saved in this ABI; the save/restore is not in this part of the file
+%define tmp_6   r13
+%define tmp_7   r14
+%else                                   ; otherwise Microsoft x64: arguments arrive in rcx, rdx, r8, r9
+%define arg1    rcx
+%define arg2    rdx
+%define arg3    r8
+%define arg4    r9
+%define tmp_1   r10
+%define tmp_2   r11
+%define tmp_3   rax
+%define tmp_4   r12                     ; r12-r15 are callee-saved in this ABI; the save/restore is not in this part of the file
+%define tmp_5   r13
+%define tmp_6   r14
+%define tmp_7   r15
+%endif
+
+%define job     arg1
+
+%define p_in    arg2
+%define p_keys  arg3
+%define p_out   arg4
+
+%define num_bytes       tmp_1   ; bytes to cipher
+%define tmp             tmp_2
+%define ctr_check       tmp_3   ; counter block overflow check
+%define bytes_to_crc    tmp_4   ; number of bytes to CRC ( < num_bytes)
+
+%define ethernet_fcs    tmp_6   ; not used together with tmp3
+%define tmp2            tmp_5
+%define tmp3            tmp_6   ; same register as ethernet_fcs
+
+%define write_back_crc   tmp_7  ; same register as decrypt_not_done
+%define decrypt_not_done tmp_7
+
+;;; ============================================================================
+;;; Encrypts one block in place with an expanded AES key schedule:
+;;;   XOR with round key 0, (rounds - 1) x AESENC, then one AESENCLAST.
+;;; Side effect: the assemble-time variable "round" ends up equal to the
+;;; number of rounds.
+%macro AES_ENC_ROUNDS 3
+%define %%KEYS          %1      ; [in] pointer to expanded keys (16 bytes per round key)
+%define %%NR            %2      ; [in] number of AES rounds (10, 12 or 14)
+%define %%STATE         %3      ; [in/out] XMM holding the block, encrypted in place
+
+        ;; round key 0 is simply XOR-ed into the block
+%assign round 0
+        pxor            %%STATE, [%%KEYS + 16*round]
+
+        ;; round keys 1 .. NR-1
+%rep (%%NR - 1)
+%assign round (round + 1)
+        aesenc          %%STATE, [%%KEYS + 16*round]
+%endrep
+
+        ;; round key NR finishes the block
+%assign round (round + 1)
+        aesenclast      %%STATE, [%%KEYS + 16*round]
+
+%endmacro
+
+;;; ============================================================================
+;;; PON stitched algorithm round on a single AES block (16 bytes):
+;;;   AES-CTR (optional, depending on %%CIPH)
+;;;   - prepares counter blocks
+;;;   - encrypts counter blocks
+;;;   - loads text
+;;;   - xor's text against encrypted blocks
+;;;   - stores cipher text
+;;;   BIP
+;;;   - BIP update on 4 x 32-bits (XOR of the cipher text block into the BIP register)
+;;;   CRC32
+;;;   - CRC32 calculation (always over the plain text block)
+;;; Note: by passing no_crc, no_bip, no_load or no_store the matching part is
+;;;       left out at assembly time, to match the needs of the overall algorithm.
+%macro DO_PON 15
+%define %%KP            %1      ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS      %2      ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR           %3      ; [in/out] XMM with counter block
+%define %%INP           %4      ; [in/out] GP with input text pointer or "no_load"
+%define %%OUTP          %5      ; [in/out] GP with output text pointer or "no_store"
+%define %%XBIP_IN_OUT   %6      ; [in/out] XMM with BIP value or "no_bip"
+%define %%XCRC_IN_OUT   %7      ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL      %8      ; [in] XMM with CRC multiplier constant (can be anything if "no_crc" below)
+%define %%TXMM0         %9      ; [clobbered|out] XMM temporary or data out (no_store)
+%define %%TXMM1         %10     ; [clobbered|in] XMM temporary or data in (no_load)
+%define %%TXMM2         %11     ; [clobbered] XMM temporary
+%define %%CRC_TYPE      %12     ; [in] "first_crc" or "next_crc" or "no_crc"
+%define %%DIR           %13     ; [in] "ENC" or "DEC"
+%define %%CIPH          %14     ; [in] "CTR" or "NO_CTR" (with NO_CTR the KP, N_ROUNDS, CTR and CTR_CHECK arguments are not used)
+%define %%CTR_CHECK     %15     ; [in/out] GP with 64bit counter (to identify overflow): incremented per block, its carry bumps the top 64 bits of CTR
+
+%ifidn %%CIPH, CTR
+        ;; prepare counter blocks for encryption: the byte-shuffled copy in TXMM0 is what gets encrypted
+        movdqa          %%TXMM0, %%CTR
+        pshufb          %%TXMM0, [rel byteswap_const]
+        ;; increment the counter by 1 (PADDQ does not carry between the two 64-bit halves, see the fix-up below)
+        movdqa          %%TXMM2,  [rel ddq_add_1]
+        paddq           %%CTR, %%TXMM2
+        add             %%CTR_CHECK, 1
+        jnc             %%_no_ctr_overflow
+        ;; Add 1 to the top 64 bits. First shift left value 1 by 64 bits.
+        pslldq          %%TXMM2, 8
+        paddq           %%CTR, %%TXMM2
+%%_no_ctr_overflow:
+%endif
+        ;; CRC calculation: multiply both 64-bit halves of the running CRC by the constants in XCRC_MUL
+%ifidn %%CRC_TYPE, next_crc
+        movdqa          %%TXMM2, %%XCRC_IN_OUT
+        pclmulqdq       %%TXMM2, %%XCRC_MUL, 0x01
+        pclmulqdq       %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+%ifnidn %%INP, no_load
+        movdqu          %%TXMM1, [%%INP]
+%endif
+
+%ifidn %%CIPH, CTR
+        ;; AES rounds
+        AES_ENC_ROUNDS  %%KP, %%N_ROUNDS, %%TXMM0
+
+        ;; xor plaintext/ciphertext against encrypted counter blocks
+        pxor            %%TXMM0, %%TXMM1
+%else ;; CIPH = NO_CTR
+        ;; if no encryption needs to be done, move from input to output reg
+        movdqa          %%TXMM0, %%TXMM1
+%endif ;; CIPH = CTR
+
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+        ;; CRC calculation for ENCRYPTION: plain text is the input block (TXMM1)
+%ifidn %%CRC_TYPE, first_crc
+        ;; in the first run just XOR initial CRC with the first block
+        pxor            %%XCRC_IN_OUT, %%TXMM1
+%endif
+%ifidn %%CRC_TYPE, next_crc
+        ;; - XOR results of CLMUL's together
+        ;; - then XOR against text block
+        pxor            %%XCRC_IN_OUT, %%TXMM2
+        pxor            %%XCRC_IN_OUT, %%TXMM1
+%endif
+%else
+        ;; CRC calculation for DECRYPTION: plain text is the output block (TXMM0)
+%ifidn %%CRC_TYPE, first_crc
+        ;; in the first run just XOR initial CRC with the first block
+        pxor            %%XCRC_IN_OUT, %%TXMM0
+%endif
+%ifidn %%CRC_TYPE, next_crc
+        ;; - XOR results of CLMUL's together
+        ;; - then XOR against text block
+        pxor            %%XCRC_IN_OUT, %%TXMM2
+        pxor            %%XCRC_IN_OUT, %%TXMM0
+%endif
+%endif                        ; DECRYPT
+%else ;; CIPH = NO_CTR
+        ;; CRC calculation without cipher: the input block (TXMM1) is used for both ENC and DEC
+%ifidn %%CRC_TYPE, first_crc
+        ;; in the first run just XOR initial CRC with the first block
+        pxor            %%XCRC_IN_OUT, %%TXMM1
+%endif
+%ifidn %%CRC_TYPE, next_crc
+        ;; - XOR results of CLMUL's together
+        ;; - then XOR against text block
+        pxor            %%XCRC_IN_OUT, %%TXMM2
+        pxor            %%XCRC_IN_OUT, %%TXMM1
+%endif
+
+%endif ;; CIPH = CTR
+
+        ;; store the result in the output buffer
+%ifnidn %%OUTP, no_store
+        movdqu          [%%OUTP], %%TXMM0
+%endif
+
+        ;; update BIP value - always use cipher text for BIP (ENC: output block, DEC: input block)
+%ifidn %%DIR, ENC
+%ifnidn %%XBIP_IN_OUT, no_bip
+        pxor            %%XBIP_IN_OUT, %%TXMM0
+%endif
+%else
+%ifnidn %%XBIP_IN_OUT, no_bip
+        pxor            %%XBIP_IN_OUT, %%TXMM1
+%endif
+%endif                          ; DECRYPT
+
+        ;; increment in/out pointers by one block
+%ifnidn %%INP, no_load
+        add             %%INP,  16
+%endif
+%ifnidn %%OUTP, no_store
+        add             %%OUTP, 16
+%endif
+%endmacro                       ; DO_PON
+
+;;; ============================================================================
+;;; CIPHER and BIP specified number of bytes (no CRC): whole 16-byte blocks first, then one partial block if any bytes remain
+%macro CIPHER_BIP_REST 14
+%define %%NUM_BYTES   %1        ; [in/clobbered] number of bytes to cipher
+%define %%DIR         %2        ; [in] "ENC" or "DEC"
+%define %%CIPH        %3        ; [in] "CTR" or "NO_CTR"
+%define %%PTR_IN      %4        ; [in/clobbered] GPR pointer to input buffer
+%define %%PTR_OUT     %5        ; [in/clobbered] GPR pointer to output buffer
+%define %%PTR_KEYS    %6        ; [in] GPR pointer to expanded keys
+%define %%XBIP_IN_OUT %7        ; [in/out] XMM 128-bit BIP state
+%define %%XCTR_IN_OUT %8        ; [in/out] XMM 128-bit AES counter block
+%define %%XMMT1       %9        ; [clobbered] temporary XMM
+%define %%XMMT2       %10       ; [clobbered] temporary XMM
+%define %%XMMT3       %11       ; [clobbered] temporary XMM
+%define %%CTR_CHECK   %12       ; [in/out] GP with 64bit counter (to identify overflow)
+%define %%GPT1        %13       ; [clobbered] temporary GP
+%define %%GPT2        %14       ; [clobbered] temporary GP
+
+%%_cipher_last_blocks:
+        cmp     %%NUM_BYTES, 16
+        jb      %%_partial_block_left   ; fewer than 16 bytes left (unsigned compare)
+
+        DO_PON  %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \
+                no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+        sub     %%NUM_BYTES, 16
+        jz      %%_bip_done             ; length was a multiple of 16: no partial block
+        jmp     %%_cipher_last_blocks
+
+%%_partial_block_left:
+        simd_load_sse_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES
+
+        ;; DO_PON() is not loading nor storing the data in this case:
+        ;; XMMT2 = data in, XMMT1 = data out
+        ;; NOTE(review): this path is also reached when the macro is entered with NUM_BYTES == 0 - confirm callers never do that
+        DO_PON  %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \
+                no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+
+        ;; BIP update for partial block (mask out bytes outside the message)
+        lea     %%GPT1, [rel mask_out_top_bytes + 16]
+        sub     %%GPT1, %%NUM_BYTES
+        movdqu  %%XMMT3, [%%GPT1]       ; mask with only the low NUM_BYTES bytes set
+        ;; put masked cipher text into XMMT2 for BIP update (ENC: taken from the output, DEC: the input already in XMMT2)
+%ifidn %%DIR, ENC
+        movdqa  %%XMMT2, %%XMMT1
+        pand    %%XMMT2, %%XMMT3
+%else
+        pand    %%XMMT2, %%XMMT3
+%endif
+        pxor    %%XBIP_IN_OUT, %%XMMT2
+
+        ;; store partial bytes in the output buffer
+        simd_store_sse_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2
+
+%%_bip_done:
+%endmacro                       ; CIPHER_BIP_REST
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial
+
+%macro CRC32_REDUCE_128_TO_32 5
+;; Reduces the 128-bit CRC state in %%XCRC to a 32-bit value written to %%CRC.
+;; Sequence: 64-bit fold and 32-bit fold using the constants at rk5, Barrett
+;; reduction using the constants at rk7, then dword 2 of the result is
+;; extracted and bit-inverted.
+;; %%XT3 holds the constants during the macro, so its incoming value is lost.
+%define %%CRC   %1         ; [out] GP to store 32-bit Ethernet FCS value
+%define %%XCRC  %2         ; [in/clobbered] XMM with CRC
+%define %%XT1   %3         ; [clobbered] temporary xmm register
+%define %%XT2   %4         ; [clobbered] temporary xmm register
+%define %%XT3   %5         ; [clobbered] temporary xmm register
+
+%define %%XCRCKEY %%XT3
+
+        ;;  compute CRC of a 128-bit value
+        movdqa          %%XCRCKEY, [rel rk5]
+
+        ;; 64b fold
+        ;; (low qword of XCRC multiplied by low qword of the constants,
+        ;;  then XOR-ed with the high qword of XCRC shifted down)
+        movdqa          %%XT1, %%XCRC
+        pclmulqdq       %%XT1, %%XCRCKEY, 0x00
+        psrldq          %%XCRC, 8
+        pxor            %%XCRC, %%XT1
+
+        ;; 32b fold
+        ;; (XCRC shifted up by 4 bytes, its low qword multiplied by the
+        ;;  high qword of the constants)
+        movdqa          %%XT1, %%XCRC
+        pslldq          %%XT1, 4
+        pclmulqdq       %%XT1, %%XCRCKEY, 0x10
+        pxor            %%XCRC, %%XT1
+
+        ;; NOTE: this label is not referenced inside the macro
+%%_crc_barrett:
+        ;; Barrett reduction
+        pand            %%XCRC, [rel mask2]
+        movdqa          %%XT1, %%XCRC   ; XT1 keeps the masked value for the final XOR
+        movdqa          %%XT2, %%XCRC
+        movdqa          %%XCRCKEY, [rel rk7] ; constants are switched from rk5 to rk7
+
+        pclmulqdq       %%XCRC, %%XCRCKEY, 0x00
+        pxor            %%XCRC, %%XT2
+        pand            %%XCRC, [rel mask]
+        movdqa          %%XT2, %%XCRC
+        pclmulqdq       %%XCRC, %%XCRCKEY, 0x10
+        pxor            %%XCRC, %%XT2
+        pxor            %%XCRC, %%XT1
+        pextrd          DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
+        not             DWORD(%%CRC)    ; final bit inversion of the CRC
+%endmacro
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_128_TO_32 4
+;; Reduces the 128-bit value in %%XMM_IN_OUT in two stages (128 -> 64 bits,
+;; then 64 -> 32 bits) using the constants stored at k3_q and p_res.
+;; The result is left in %%XMM_IN_OUT.
+%define %%XMM_IN_OUT %1         ; [in/out] xmm register with data in and out
+%define %%XT1        %2         ; [clobbered] temporary xmm register
+%define %%XT2        %3         ; [clobbered] temporary xmm register
+%define %%XT3        %4         ; [clobbered] temporary xmm register
+
+%define %%K3_Q  %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP  %%XT3
+
+        ;; 128 to 64 bit reduction
+        ;; (constants are addressed RIP-relative like all other static data
+        ;;  in this file, so no absolute relocation is generated)
+        movdqa          %%K3_Q,  [rel k3_q]
+        movdqa          %%P_RES, [rel p_res]
+
+        movdqa          %%XTMP, %%XMM_IN_OUT
+        pclmulqdq       %%XTMP, %%K3_Q, 0x01 ; K3
+        pxor            %%XTMP, %%XMM_IN_OUT
+
+        pclmulqdq       %%XTMP, %%K3_Q, 0x01 ; K3
+        pxor            %%XMM_IN_OUT, %%XTMP
+
+        pand            %%XMM_IN_OUT, [rel mask_out_top_64bits]
+
+        ;; 64 to 32 bit reduction
+        movdqa          %%XTMP, %%XMM_IN_OUT
+        psrldq          %%XTMP, 4
+        pclmulqdq       %%XTMP, %%K3_Q, 0x10 ; Q
+        pxor            %%XTMP, %%XMM_IN_OUT
+        psrldq          %%XTMP, 4
+
+        pclmulqdq       %%XTMP, %%P_RES, 0x00 ; P
+        pxor            %%XMM_IN_OUT, %%XTMP
+%endmacro
+
+;; =============================================================================
+;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_64_TO_32 4
+;; Reduces the value in %%XMM_IN_OUT from 64 to 32 bits using the constants
+;; stored at k3_q and p_res. The result is left in %%XMM_IN_OUT.
+%define %%XMM_IN_OUT %1         ; [in/out] xmm register with data in and out
+%define %%XT1        %2         ; [clobbered] temporary xmm register
+%define %%XT2        %3         ; [clobbered] temporary xmm register
+%define %%XT3        %4         ; [clobbered] temporary xmm register
+
+%define %%K3_Q  %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP  %%XT3
+
+        ;; constants are addressed RIP-relative like all other static data
+        ;; in this file, so no absolute relocation is generated
+        movdqa          %%K3_Q,  [rel k3_q]
+        movdqa          %%P_RES, [rel p_res]
+
+        ;; 64 to 32 bit reduction
+        movdqa          %%XTMP, %%XMM_IN_OUT
+        psrldq          %%XTMP, 4
+        pclmulqdq       %%XTMP, %%K3_Q, 0x10 ; Q
+        pxor            %%XTMP, %%XMM_IN_OUT
+        psrldq          %%XTMP, 4
+
+        pclmulqdq       %%XTMP, %%P_RES, 0x00 ; P
+        pxor            %%XMM_IN_OUT, %%XTMP
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 32-bit XGEM headers
+%macro HEC_COMPUTE_32 6
+;; Recomputes the HEC field of a 32-bit header held in %%HEC_IN_OUT:
+;; the low 13 bits are replaced by a freshly computed 12-bit CRC plus a
+;; parity bit in bit 0.
+;; Parameters %3..%6 are the XMM temporaries, matching HEC_COMPUTE_64
+;; (the macro takes exactly 6 parameters, so there is no %7).
+%define %%HEC_IN_OUT %1         ; [in/out] GP register with HEC in LE format
+%define %%GT1        %2         ; [clobbered] temporary GP register
+%define %%XT1        %3         ; [clobbered] temporary xmm register
+%define %%XT2        %4         ; [clobbered] temporary xmm register
+%define %%XT3        %5         ; [clobbered] temporary xmm register
+%define %%XT4        %6         ; [clobbered] temporary xmm register
+
+        mov             DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+        ;; shift out 13 bits of HEC value for CRC computation
+        shr             DWORD(%%GT1), 13
+
+        ;; mask out current HEC value to merge with an updated HEC at the end
+        and             DWORD(%%HEC_IN_OUT), 0xffff_e000
+
+        ;; prepare the message for CRC computation
+        movd            %%XT1, DWORD(%%GT1)
+        pslldq          %%XT1, 4         ; shift left by 32-bits
+
+        HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+        ;; extract 32-bit value
+        ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+        movd            DWORD(%%GT1), %%XT1
+        shr             DWORD(%%GT1), (20 - 1)
+
+        ;; merge header bytes with updated 12-bit CRC value and
+        ;; compute parity
+        or              DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+        popcnt          DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+        and             DWORD(%%HEC_IN_OUT), 1  ; keep only the parity (lowest bit of the count)
+        or              DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 64-bit XGEM headers
+%macro HEC_COMPUTE_64 6
+;; Recomputes the HEC field of a 64-bit header held in %%HEC_IN_OUT:
+;; the low 13 bits are replaced by a freshly computed 12-bit CRC plus a
+;; parity bit in bit 0. The header bits above bit 12 are left unchanged.
+%define %%HEC_IN_OUT %1         ; [in/out] GP register with HEC in LE format
+%define %%GT1        %2         ; [clobbered] temporary GP register
+%define %%XT1        %3         ; [clobbered] temporary xmm register
+%define %%XT2        %4         ; [clobbered] temporary xmm register
+%define %%XT3        %5         ; [clobbered] temporary xmm register
+%define %%XT4        %6         ; [clobbered] temporary xmm register
+
+        mov             %%GT1, %%HEC_IN_OUT
+        ;; shift out 13 bits of HEC value for CRC computation
+        shr             %%GT1, 13
+
+        ;; mask out current HEC value to merge with an updated HEC at the end
+        and             %%HEC_IN_OUT, 0xffff_ffff_ffff_e000
+
+        ;; prepare the message for CRC computation
+        movq            %%XT1, %%GT1
+        pslldq          %%XT1, 4         ; shift left by 32-bits
+
+        HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+        ;; extract 32-bit value
+        ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+        movd            DWORD(%%GT1), %%XT1
+        shr             DWORD(%%GT1), (20 - 1)
+
+        ;; merge header bytes with updated 12-bit CRC value and
+        ;; compute parity
+        or              %%GT1, %%HEC_IN_OUT
+        popcnt          %%HEC_IN_OUT, %%GT1
+        and             %%HEC_IN_OUT, 1  ; keep only the parity (lowest bit of the count)
+        or              %%HEC_IN_OUT, %%GT1
+%endmacro
+
+;;; ============================================================================
+;;; PON stitched algorithm of AES128-CTR, CRC and BIP
+;;; - this is master macro that implements encrypt/decrypt API
+;;; - calls other macros and directly uses registers
+;;;   defined at the top of the file
+%macro AES128_CTR_PON 2
+;; Body of the PON encrypt/decrypt entry points. In order, it:
+;; - reads the 8-byte XGEM header (ENC also recomputes its HEC and writes it back)
+;;   and seeds the BIP state with it
+;; - derives the number of bytes covered by the Ethernet FCS from the header PLI
+;; - ciphers the payload while accumulating CRC and BIP (ordering differs
+;;   between ENC and DEC, see the inline comments)
+;; - stores the FCS at [auth tag + 4] and the folded 32-bit BIP at [auth tag]
+;; - ORs STS_COMPLETED into the job status and returns the job pointer in rax
+;; r12-r14 (and r15 when LINUX is not defined) are saved and restored here.
+%define %%DIR   %1              ; [in] direction "ENC" or "DEC"
+%define %%CIPH  %2              ; [in] cipher "CTR" or "NO_CTR"
+
+        push    r12
+        push    r13
+        push    r14
+%ifndef LINUX
+        push    r15
+%endif
+
+%ifidn %%DIR, ENC
+        ;; by default write back CRC for encryption
+        mov     DWORD(write_back_crc), 1
+%else
+        ;; mark decryption as finished
+        ;; (despite its name, a non-zero decrypt_not_done makes the final
+        ;;  cipher step at the end of this macro be skipped)
+        mov     DWORD(decrypt_not_done), 1
+%endif
+        ;; START BIP (and update HEC if encrypt direction)
+        ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload)
+        ;; - convert it into LE
+        ;; - update HEC field in the header
+        ;; - convert it into BE
+        ;; - store back the header (with updated HEC)
+        ;; - start BIP
+        ;; (free to use tmp_1, tmp_2 and tmp_3 at this stage)
+        mov     tmp_2, [job + _src]
+        add     tmp_2, [job + _hash_start_src_offset_in_bytes]
+        mov     tmp_3, [tmp_2]
+%ifidn %%DIR, ENC
+        bswap   tmp_3                   ; go to LE
+        HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4
+        mov     bytes_to_crc, tmp_3
+        shr     bytes_to_crc, (48 + 2)  ; PLI = MSB 14 bits
+        bswap   tmp_3                   ; go back to BE
+        mov     [tmp_2], tmp_3
+        movq    xbip, tmp_3
+%else
+        movq    xbip, tmp_3
+        mov     bytes_to_crc, tmp_3
+        bswap   bytes_to_crc            ; go to LE
+        shr     bytes_to_crc, (48 + 2)  ; PLI = MSB 14 bits
+%endif
+        cmp     bytes_to_crc, 4
+        ja      %%_crc_not_zero
+        ;; XGEM payload shorter or equal to 4 bytes
+%ifidn %%DIR, ENC
+        ;; Don't write Ethernet FCS on encryption
+        xor     DWORD(write_back_crc), DWORD(write_back_crc)
+%else
+        ;; Mark decryption as not finished
+        ;; - Ethernet FCS is not computed
+        ;; - decrypt + BIP to be done at the end
+        xor     DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+%endif
+        mov     DWORD(bytes_to_crc), 4  ; it will be zero after the sub (avoid jmp)
+%%_crc_not_zero:
+        sub     bytes_to_crc, 4         ; subtract size of the CRC itself
+
+%ifidn %%CIPH, CTR
+        ;; - read 16 bytes of IV
+        ;; - convert to little endian format
+        ;; - save least significant 8 bytes in GP register for overflow check
+        mov     tmp, [job + _iv]
+        movdqu  xcounter, [tmp]
+        pshufb  xcounter, [rel byteswap_const]
+        movq    ctr_check, xcounter
+%endif
+
+        ;; get input buffer (after XGEM header)
+        mov     p_in, [job + _src]
+        add     p_in, [job + _cipher_start_src_offset_in_bytes]
+
+        ;; get output buffer
+        mov     p_out, [job + _dst]
+
+%ifidn %%CIPH, CTR
+        ;; get key pointers
+        mov     p_keys, [job + _aes_enc_key_expanded]
+%endif
+
+        ;; initial CRC value
+        movdqa  xcrc, [rel init_crc_value]
+
+        ;; load CRC constants
+        movdqa  xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey
+
+        ;; get number of bytes to cipher
+%ifidn %%CIPH, CTR
+        mov     num_bytes, [job + _msg_len_to_cipher_in_bytes]
+%else
+        ;; Message length to cipher is 0
+        ;; - length is obtained from message length to hash (BIP) minus XGEM header size
+        mov     num_bytes, [job + _msg_len_to_hash_in_bytes]
+        sub     num_bytes, 8
+%endif
+        test    bytes_to_crc, bytes_to_crc
+        jz      %%_crc_done
+
+        cmp     bytes_to_crc, 32
+        jae     %%_at_least_32_bytes
+
+%ifidn %%DIR, DEC
+        ;; decrypt the buffer first
+        mov     tmp, num_bytes
+        CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+                        xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+        ;; correct in/out pointers - go back to start of the buffers
+        mov     tmp, num_bytes
+        and     tmp, -16        ; partial block handler doesn't increment pointers
+        sub     p_in, tmp
+        sub     p_out, tmp
+%endif                          ; DECRYPTION
+
+        ;; less than 32 bytes
+        ;; (bytes_to_crc is an unsigned byte count, so the unsigned
+        ;;  condition is used, as in the other length checks of this macro)
+        cmp     bytes_to_crc, 16
+        je      %%_exact_16_left
+        jb      %%_less_than_16_left
+        ;; load the plaintext
+%ifidn %%DIR, ENC
+        movdqu  xtmp1, [p_in]
+%else
+        movdqu  xtmp1, [p_out]
+%endif
+        pxor    xcrc, xtmp1   ; xor the initial crc value
+        jmp     %%_crc_two_xmms
+
+%%_exact_16_left:
+%ifidn %%DIR, ENC
+        movdqu  xtmp1, [p_in]
+%else
+        movdqu  xtmp1, [p_out]
+%endif
+        pxor    xcrc, xtmp1 ; xor the initial CRC value
+        jmp     %%_128_done
+
+%%_less_than_16_left:
+%ifidn %%DIR, ENC
+        simd_load_sse_15_1 xtmp1, p_in, bytes_to_crc
+%else
+        simd_load_sse_15_1 xtmp1, p_out, bytes_to_crc
+%endif
+        pxor    xcrc, xtmp1 ; xor the initial CRC value
+
+        lea     tmp, [rel pshufb_shf_table]
+        movdqu  xtmp1, [tmp + bytes_to_crc]
+        pshufb  xcrc, xtmp1
+        jmp     %%_128_done
+
+%%_at_least_32_bytes:
+        DO_PON  p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+                xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, %%CIPH, ctr_check
+        sub     num_bytes, 16
+        sub     bytes_to_crc, 16
+
+%%_main_loop:
+        cmp     bytes_to_crc, 16
+        jb      %%_exit_loop
+        DO_PON  p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+                xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, %%CIPH, ctr_check
+        sub     num_bytes, 16
+        sub     bytes_to_crc, 16
+%ifidn %%DIR, ENC
+        jz      %%_128_done
+%endif
+        jmp     %%_main_loop
+
+%%_exit_loop:
+
+%ifidn %%DIR, DEC
+        ;; decrypt rest of the message including CRC and optional padding
+        mov     tmp, num_bytes
+
+        CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+                        xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+        mov     tmp, num_bytes  ; correct in/out pointers - to point before cipher & BIP
+        and     tmp, -16        ; partial block handler doesn't increment pointers
+        sub     p_in, tmp
+        sub     p_out, tmp
+
+        test    bytes_to_crc, bytes_to_crc
+        jz      %%_128_done
+%endif                          ; DECRYPTION
+
+        ;; Partial bytes left - complete CRC calculation
+        ;; NOTE(review): reached with bytes_to_crc in 1..15 from the main loop
+        ;; but in 17..31 from the short-buffer path above - confirm that
+        ;; pshufb_shf_table is meant to be indexed by both ranges
+%%_crc_two_xmms:
+        lea             tmp, [rel pshufb_shf_table]
+        movdqu          xtmp2, [tmp + bytes_to_crc]
+%ifidn %%DIR, ENC
+        movdqu          xtmp1, [p_in - 16 + bytes_to_crc]  ; xtmp1 = data for CRC
+%else
+        movdqu          xtmp1, [p_out - 16 + bytes_to_crc]  ; xtmp1 = data for CRC
+%endif
+        movdqa          xtmp3, xcrc
+        pshufb          xcrc, xtmp2  ; top num_bytes with LSB xcrc
+        pxor            xtmp2, [rel mask3]
+        pshufb          xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc
+
+        ;; data num_bytes (top) blended with MSB bytes of CRC (bottom)
+        movdqa          xmm0, xtmp2
+        pblendvb        xtmp3, xtmp1 ; xmm0 implicit
+
+        ;; final CRC calculation
+        movdqa          xtmp1, xcrc
+        pclmulqdq       xtmp1, xcrckey, 0x01
+        pclmulqdq       xcrc, xcrckey, 0x10
+        pxor            xcrc, xtmp3
+        pxor            xcrc, xtmp1
+
+%%_128_done:
+        CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey
+
+%%_crc_done:
+        ;; @todo - store-to-load problem in ENC case (to be fixed later)
+        ;; - store CRC in input buffer and authentication tag output
+        ;; - encrypt remaining bytes
+%ifidn %%DIR, ENC
+        test    DWORD(write_back_crc), DWORD(write_back_crc)
+        jz      %%_skip_crc_write_back
+        mov     [p_in + bytes_to_crc], DWORD(ethernet_fcs)
+%%_skip_crc_write_back:
+%endif
+        ;; NOTE(review): when the zero-length check jumps straight to
+        ;; %%_crc_done the reduction above is skipped, so the value stored
+        ;; to the tag here is not a computed FCS - confirm callers ignore it
+        mov     tmp, [job + _auth_tag_output]
+        mov     [tmp + 4], DWORD(ethernet_fcs)
+
+        test    num_bytes, num_bytes
+        jz      %%_do_not_cipher_the_rest
+
+        ;; encrypt rest of the message
+        ;; - partial bytes including CRC and optional padding
+        ;; decrypt rest of the message
+        ;; - this may only happen when XGEM payload is short and padding is added
+%ifidn %%DIR, DEC
+        test    DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+        jnz     %%_do_not_cipher_the_rest
+%endif
+        CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+                        xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+%%_do_not_cipher_the_rest:
+
+        ;; finalize BIP
+        ;; (XOR the four dwords of xbip together; tmp still holds the
+        ;;  authentication tag pointer loaded above)
+        movdqa  xtmp1, xbip
+        movdqa  xtmp2, xbip
+        movdqa  xtmp3, xbip
+        psrldq  xtmp1, 4
+        psrldq  xtmp2, 8
+        psrldq  xtmp3, 12
+        pxor    xtmp1, xtmp2
+        pxor    xbip, xtmp3
+        pxor    xbip, xtmp1
+        movd    [tmp], xbip
+
+        ;; set job status
+        or      dword [job + _status], STS_COMPLETED
+
+        ;;  return job
+        mov     rax, job
+
+%ifndef LINUX
+        pop     r15
+%endif
+        pop     r14
+        pop     r13
+        pop     r12
+%endmacro                       ; AES128_CTR_PON
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; aes_cntr_128_pon_enc_sse(JOB_AES_HMAC *job)
+;;; Encrypt direction with AES-CTR ciphering (AES128_CTR_PON ENC, CTR).
+;;; The macro leaves the job pointer in rax before the return.
+align 32
+MKGLOBAL(ENC_FN_NAME,function,internal)
+ENC_FN_NAME:
+        AES128_CTR_PON ENC, CTR
+        ret
+
+;;; aes_cntr_128_pon_dec_sse(JOB_AES_HMAC *job)
+;;; Decrypt direction with AES-CTR ciphering (AES128_CTR_PON DEC, CTR).
+;;; The macro leaves the job pointer in rax before the return.
+align 32
+MKGLOBAL(DEC_FN_NAME,function,internal)
+DEC_FN_NAME:
+        AES128_CTR_PON DEC, CTR
+        ret
+
+;;; aes_cntr_128_pon_enc_no_ctr_sse(JOB_AES_HMAC *job)
+;;; Encrypt direction in NO_CTR mode (AES128_CTR_PON ENC, NO_CTR): the IV and
+;;; expanded keys are not read, and the length is taken from the hash length.
+;;; The macro leaves the job pointer in rax before the return.
+align 32
+MKGLOBAL(ENC_NO_CTR_FN_NAME,function,internal)
+ENC_NO_CTR_FN_NAME:
+        AES128_CTR_PON ENC, NO_CTR
+        ret
+
+;;; aes_cntr_128_pon_dec_no_ctr_sse(JOB_AES_HMAC *job)
+;;; Decrypt direction in NO_CTR mode (AES128_CTR_PON DEC, NO_CTR): the IV and
+;;; expanded keys are not read, and the length is taken from the hash length.
+;;; The macro leaves the job pointer in rax before the return.
+align 32
+MKGLOBAL(DEC_NO_CTR_FN_NAME,function,internal)
+DEC_NO_CTR_FN_NAME:
+        AES128_CTR_PON DEC, NO_CTR
+        ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm
new file mode 100644
index 000000000..355a38906
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm
@@ -0,0 +1,435 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+
+section .data
+default rel
+align 16
+;; pshufb mask that reverses the byte order inside each 32-bit lane
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+;; Round constants, one per group of 20 rounds, replicated in all four
+;; 32-bit lanes so that a single paddd applies them to every lane
+K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
+	dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+	dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+	dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+	dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+;; code to compute quad SHA1 using SSE
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+;; rbx, rsi, rdi, rbp, r12-r15 left intact
+;; This version is not safe to call from C/C++
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers:  rax         rdx             r8 r9 r10 r11
+;; Windows preserves:     rbx rcx     rsi rdi rbp               r12 r13 r14 r15
+;;
+;; Linux clobbers:    rax             rsi         r8 r9 r10 r11
+;; Linux preserves:       rbx rcx rdx     rdi rbp               r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0, t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+; Note: the rows come back in t0, r1, r0, r3 (in that order); r2 and t1 are
+; left holding intermediate values.
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+	; step 1: pair up the low and high halves of rows a/b and of rows c/d
+	movaps	%%t0, %%r0		; t0 = {a3 a2 a1 a0}
+	shufps	%%t0, %%r1, 0x44	; t0 = {b1 b0 a1 a0}
+	shufps	%%r0, %%r1, 0xEE	; r0 = {b3 b2 a3 a2}
+
+	movaps	%%t1, %%r2		; t1 = {c3 c2 c1 c0}
+	shufps	%%t1, %%r3, 0x44	; t1 = {d1 d0 c1 c0}
+	shufps	%%r2, %%r3, 0xEE	; r2 = {d3 d2 c3 c2}
+
+	; step 2: pick odd (0xDD) or even (0x88) elements from the paired halves
+	movaps	%%r1, %%t0		; r1 = {b1 b0 a1 a0}
+	shufps	%%r1, %%t1, 0xDD	; r1 = {d1 c1 b1 a1}
+
+	movaps	%%r3, %%r0		; r3 = {b3 b2 a3 a2}
+	shufps	%%r3, %%r2, 0xDD	; r3 = {d3 c3 b3 a3}
+
+	shufps	%%r0, %%r2, 0x88	; r0 = {d2 c2 b2 a2}
+	shufps	%%t0, %%t1, 0x88	; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
+; Bitwise select: each result bit is taken from C where B is set, else from D.
+; T is accepted so all MAGIC_Fi macros share one signature; it is not used.
+%macro MAGIC_F0 5
+%define %%out    %1
+%define %%inB    %2
+%define %%inC    %3
+%define %%inD    %4
+%define %%unused %5
+	movdqa	%%out, %%inC		; out = C
+	pxor	%%out, %%inD		; out = C ^ D
+	pand	%%out, %%inB		; out = B & (C ^ D)
+	pxor	%%out, %%inD		; out = D ^ (B & (C ^ D))
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
+; Three-way XOR. T is accepted for signature uniformity; it is not used.
+%macro MAGIC_F1 5
+%define %%out    %1
+%define %%inB    %2
+%define %%inC    %3
+%define %%inD    %4
+%define %%unused %5
+	movdqa	%%out, %%inD		; out = D
+	pxor	%%out, %%inC		; out = D ^ C
+	pxor	%%out, %%inB		; out = D ^ C ^ B
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
+; Bitwise majority, evaluated as ((B | C) & D) | (B & C). T is overwritten.
+%macro MAGIC_F2 5
+%define %%out     %1
+%define %%inB     %2
+%define %%inC     %3
+%define %%inD     %4
+%define %%scratch %5
+	movdqa	%%out, %%inB		; out     = B
+	movdqa	%%scratch, %%inB	; scratch = B
+	por	%%out, %%inC		; out     = B | C
+	pand	%%scratch, %%inC	; scratch = B & C
+	pand	%%out, %%inD		; out     = (B | C) & D
+	por	%%out, %%scratch	; out     = ((B | C) & D) | (B & C)
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
+; Same function as MAGIC_F1, written out directly. T is not used.
+%macro MAGIC_F3 5
+%define %%out    %1
+%define %%inB    %2
+%define %%inC    %3
+%define %%inD    %4
+%define %%unused %5
+	movdqa	%%out, %%inD		; out = D
+	pxor	%%out, %%inC		; out = D ^ C
+	pxor	%%out, %%inB		; out = D ^ C ^ B
+%endmacro
+
+; PROLD reg, imm, tmp
+; Rotates every 32-bit lane of reg left by imm bits; tmp is overwritten.
+%macro PROLD 3
+%define %%val   %1
+%define %%count %2
+%define %%spare %3
+	movdqa	%%spare, %%val			; keep a copy for the wrapped-around bits
+	pslld	%%val, %%count			; val   = x << count
+	psrld	%%spare, (32-%%count)		; spare = x >> (32 - count)
+	por	%%val, %%spare			; val   = rotate-left(x, count)
+%endmacro
+
+;; One SHA-1 step that takes its message word directly from the stack frame:
+;;   E += K + W[memW] + rotl(A, 5) + MAGIC(B, C, D);  B = rotl(B, 30)
+;; A, C and D are not modified. regT and regF are scratch; regF ends up
+;; holding the MAGIC result.
+%macro SHA1_STEP_00_15 10
+%define %%regA	%1
+%define %%regB	%2
+%define %%regC	%3
+%define %%regD	%4
+%define %%regE	%5
+%define %%regT	%6
+%define %%regF	%7
+%define %%memW	%8
+%define %%immCNT %9
+%define %%MAGIC	%10
+	paddd	%%regE,%%immCNT
+	paddd	%%regE,[rsp + (%%memW * 16)]
+	movdqa	%%regT,%%regA
+	PROLD	%%regT,5, %%regF		; regF is only a scratch register here
+	paddd	%%regE,%%regT
+	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT      ;; FUN  = MAGIC_Fi(B,C,D)
+	PROLD	%%regB,30, %%regT		; regT is free again and used as scratch
+	paddd	%%regE,%%regF
+%endmacro
+
+;; One SHA-1 step that first derives its message word from the 16-entry
+;; circular buffer in the stack frame:
+;;   W = rotl(W16 ^ W[memW-14] ^ W[memW-8] ^ W[memW-3], 1)
+;; The new word is stored at slot (memW & 15) and then used as in
+;; SHA1_STEP_00_15. Uses the file-level aliases W14/W15/W16 and rotates them
+;; with ROTATE_W, so the alias-to-register mapping changes on every invocation.
+%macro SHA1_STEP_16_79 10
+%define %%regA	%1
+%define %%regB	%2
+%define %%regC	%3
+%define %%regD	%4
+%define %%regE	%5
+%define %%regT	%6
+%define %%regF	%7
+%define %%memW	%8
+%define %%immCNT %9
+%define %%MAGIC	%10
+	paddd	%%regE,%%immCNT
+	movdqa	W14, [rsp + ((%%memW - 14) & 15) * 16]
+	pxor	W16, W14
+	pxor	W16, [rsp + ((%%memW -  8) & 15) * 16]
+	pxor	W16, [rsp + ((%%memW -  3) & 15) * 16]
+	; rotate the XOR result left by one bit, result in regF
+	movdqa	%%regF, W16
+	pslld	W16, 1
+	psrld	%%regF, (32-1)
+	por	%%regF, W16
+	ROTATE_W
+
+	movdqa	[rsp + ((%%memW - 0) & 15) * 16],%%regF
+	paddd	%%regE,%%regF
+	movdqa	%%regT,%%regA
+	PROLD	%%regT,5, %%regF		; regF was already consumed, reuse it as scratch
+	paddd	%%regE,%%regT
+	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT      ;; FUN  = MAGIC_Fi(B,C,D)
+	PROLD	%%regB,30, %%regT		; regT is free again and used as scratch
+	paddd	%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ must be an odd multiple of 8
+;; (16 slots of 16 bytes for the message schedule, plus 8 bytes)
+%define FRAMESZ	16*16 + 8
+
+;; load instruction used for the message data
+%define MOVPS	movdqu
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+;; data pointers of the four lanes
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+;; byte offset into the four input buffers
+%define IDX  rax
+
+;; working variables, one 32-bit lane per buffer
+%define A	xmm0
+%define B	xmm1
+%define C	xmm2
+%define D	xmm3
+%define E	xmm4
+%define F	xmm5 ; tmp
+%define G       xmm6 ; tmp
+
+%define TMP	G
+%define FUN	F
+%define K	xmm7
+
+;; digest values saved before the rounds of each block
+%define AA	xmm8
+%define BB	xmm9
+%define CC	xmm10
+%define DD	xmm11
+%define EE	xmm12
+
+;; transpose temporaries - these share registers with G, K and AA..DD
+%define T0	xmm6
+%define T1	xmm7
+%define T2	xmm8
+%define T3	xmm9
+%define T4	xmm10
+%define T5	xmm11
+
+;; message schedule registers, renamed by ROTATE_W
+%define W14	xmm13
+%define W15	xmm14
+%define W16	xmm15
+
+;; Renames the working variables for the next step without moving data:
+;; the register known as E becomes A, A becomes B, B becomes C, C becomes D
+;; and D becomes E. TMP_ is a helper symbol used only for the swap.
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; Renames the message schedule registers without moving data:
+;; the register known as W15 becomes W16, W14 becomes W15 and the old W16
+;; register becomes W14. TMP_ is a helper symbol used only for the swap.
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+align 32
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void sha1_mult_sse(SHA1_ARGS *args, UINT32 size_in_blocks);
+; arg 1 : arg1 (rdi on Linux, rcx otherwise) : pointer to args
+; arg 2 : arg2 (rsi on Linux, rdx otherwise) : size (in blocks) ;; assumed to be >= 1
+;
+; Processes size_in_blocks 64-byte blocks from each of the four data pointers
+; stored in args, updates the five digest rows in args and writes back the
+; advanced data pointers. arg2 is counted down to zero and is therefore
+; clobbered.
+MKGLOBAL(sha1_mult_sse,function,internal)
+sha1_mult_sse:
+
+	sub	rsp, FRAMESZ
+
+	;; Initialize digests
+	movdqa	A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
+	movdqa	B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
+	movdqa	C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
+	movdqa	D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
+	movdqa	E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]
+	DBGPRINTL_XMM "Sha1-SSE Incoming transposed digest", A, B, C, D, E
+        ;; load input pointers
+	mov	inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ]
+	mov	inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ]
+	mov	inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ]
+	mov	inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ]
+        DBGPRINTL64 "Sha1-SSE Incoming data ptrs", inp0, inp1, inp2, inp3
+	xor	IDX, IDX
+lloop:
+	;; F is only needed as a working register later, so it can hold
+	;; the byte-swap mask while the block is loaded
+	movdqa	F, [rel PSHUFFLE_BYTE_FLIP_MASK]
+	;; load one 64-byte block per lane, 16 bytes at a time: transpose so
+	;; each XMM holds the same message word of all four lanes, byte-swap
+	;; each 32-bit word and store it into the 16-slot frame at rsp
+%assign I 0
+%rep 4
+	MOVPS	T2,[inp0+IDX]
+	MOVPS	T1,[inp1+IDX]
+	MOVPS	T4,[inp2+IDX]
+	MOVPS	T3,[inp3+IDX]
+	TRANSPOSE	T2, T1, T4, T3, T0, T5
+        DBGPRINTL_XMM "sha1 incoming data", T0, T1, T2, T3
+	pshufb	T0, F
+	movdqa	[rsp+(I*4+0)*16],T0
+	pshufb	T1, F
+	movdqa	[rsp+(I*4+1)*16],T1
+	pshufb	T2, F
+	movdqa	[rsp+(I*4+2)*16],T2
+	pshufb	T3, F
+	movdqa	[rsp+(I*4+3)*16],T3
+	add	IDX, 4*4
+%assign I (I+1)
+%endrep
+
+	; save old digests
+	; (AA..DD share registers with T2..T5, which are no longer needed)
+	movdqa	AA, A
+	movdqa	BB, B
+	movdqa	CC, C
+	movdqa	DD, D
+	movdqa	EE, E
+
+;;
+;; perform 0-79 steps
+;;
+	movdqa	K, [rel K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+	ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+	;; preload slots 0 and 1 of the frame into W16 and W15
+	movdqa	W16, [rsp + ((16 - 16) & 15) * 16]
+	movdqa	W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+	ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+	movdqa	K, [rel K20_39]
+%rep 20
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+	ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+	movdqa	K, [rel K40_59]
+%rep 20
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+	ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+	movdqa	K, [rel K60_79]
+%rep 20
+	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+	ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+	; add the digests saved before the rounds
+	paddd	A,AA
+	paddd	B,BB
+	paddd	C,CC
+	paddd	D,DD
+	paddd	E,EE
+
+	; one block done in every lane; loop until the block count reaches zero
+	sub	arg2, 1
+	jne	lloop
+
+	; write out digests
+	movdqa	[arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
+	movdqa	[arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
+	movdqa	[arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
+	movdqa	[arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
+	movdqa	[arg1 + 4*SHA1_DIGEST_ROW_SIZE], E
+        DBGPRINTL_XMM "Sha1 Outgoing transposed digest", A, B, C, D, E
+	; update input pointers
+	; (IDX holds the total number of bytes consumed per lane)
+	add	inp0, IDX
+	mov	[arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
+	add	inp1, IDX
+	mov	[arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
+	add	inp2, IDX
+	mov	[arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
+	add	inp3, IDX
+	mov	[arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3
+        DBGPRINTL64 "Sha1-sse outgoing data ptrs", inp0, inp1, inp2, inp3
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+
+        ;; Clear stack frame (16*16 bytes)
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+%assign i 0
+%rep 16
+        movdqa	[rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+	add	rsp, FRAMESZ
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm
new file mode 100644
index 000000000..c02c88eed
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm
@@ -0,0 +1,493 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers:		RAX RBX RCX RDX RBP RSI RDI R8  R9  R10 R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Windows clobbers:	            RDX                     R10 R11
+;; Windows preserves:	RAX RBX RCX     RBP RSI RDI R8  R9          R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Linux clobbers:	                    RSI             R10 R11
+;; Linux preserves:	RAX RBX RCX RDX RBP     RDI R8  R9          R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define arg3	rcx
+%define arg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define arg3	rdi
+%define arg4	rsi
+%endif
+
+%define args            arg1
+%define NUM_BLKS 	arg2
+
+; reso = resdq => 16 bytes
+struc frame
+.ABCD_SAVE	reso	1	; lane A: ABCD as stored at the top of each loop0 pass
+.E_SAVE		reso	1	; lane A: E0 as stored at the top of each loop0 pass
+.ABCD_SAVEb	reso	1	; lane B: ABCDb as stored at the top of each loop0 pass
+.E_SAVEb	reso	1	; lane B: E0b as stored at the top of each loop0 pass
+.align		resq	1	; 8 pad bytes, not referenced by sha1_ni; presumably keeps rsp 16-byte aligned (TODO confirm)
+endstruc
+
+%define INP		r10
+%define INPb		r11
+
+%define ABCD		xmm0
+%define E0		xmm1	; Need two E's b/c they ping pong
+%define E1		xmm2
+%define MSG0		xmm3
+%define MSG1		xmm4
+%define MSG2		xmm5
+%define MSG3		xmm6
+
+%define ABCDb		xmm7
+%define E0b		xmm8	; Need two E's b/c they ping pong
+%define E1b		xmm9
+%define MSG0b		xmm10
+%define MSG1b		xmm11
+%define MSG2b		xmm12
+%define MSG3b		xmm13
+
+%define SHUF_MASK	xmm14
+%define E_MASK		xmm15
+
+section .data
+default rel
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x000102030405060708090a0b0c0d0e0f
+	dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+UPPER_WORD_MASK:         ;ddq 0xFFFFFFFF000000000000000000000000
+	dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_ni(SHA1_ARGS *args, UINT32 size_in_blocks)
+;; arg1 : pointer to args
+;; arg2 : size (in blocks) ;; assumed to be >= 1
+
+section .text
+MKGLOBAL(sha1_ni,function,internal)
+align 32
+sha1_ni:
+	sub		rsp, frame_size	; reserve the digest save area described by struc frame
+
+        DBGPRINTL "enter sha1-ni-x2"
+
+	shl		NUM_BLKS, 6	; convert to bytes
+	jz		done_hash	; zero bytes requested: go straight to the epilogue
+
+	;; load input pointers
+	mov		INP, [args + _data_ptr_sha1 + 0*PTR_SZ]
+	DBGPRINTL64 "jobA: pointer", INP
+	mov		INPb, [args + _data_ptr_sha1 + 1*PTR_SZ]
+
+	add		NUM_BLKS, INP	; pointer to end of data block -> loop exit condition
+
+	;; load initial digest (lane A first, then lane B)
+	movdqu		ABCD, [args + 0*SHA1NI_DIGEST_ROW_SIZE]
+	pxor		E0, E0
+	pinsrd		E0, [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3	; digest word 4 goes into the top dword of E0
+	pshufd		ABCD, ABCD, 0x1B	; 0x1B reverses the order of the four dwords
+
+        DBGPRINTL_XMM	"jobA: digest in words[0-3]", ABCD
+        DBGPRINTL_XMM	"jobA: digest in word 4", E0
+
+	 movdqu		 ABCDb, [args + 1*SHA1NI_DIGEST_ROW_SIZE]
+	 pxor		 E0b, E0b
+	 pinsrd		 E0b,   [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3
+	 pshufd		 ABCDb, ABCDb, 0x1B
+
+	movdqa		SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+	movdqa		E_MASK, [rel UPPER_WORD_MASK]
+
+	DBGPRINTL "jobA data:"
+loop0:
+	;; Copy digests (added back in after round 79)
+	movdqa		[rsp + frame.ABCD_SAVE], ABCD
+	movdqa		[rsp + frame.E_SAVE],    E0
+	 movdqa		 [rsp + frame.ABCD_SAVEb], ABCDb
+	 movdqa		 [rsp + frame.E_SAVEb],    E0b
+
+	;; Only needed if not using sha1nexte for rounds 0-3
+	pand		E0,   E_MASK	; keep only the top dword of E0
+	 pand		 E0b,   E_MASK
+
+	;; Needed if using sha1nexte for rounds 0-3
+	;; Need to rotate E right by 30
+	;movdqa		E1, E0
+	;psrld		E0, 30
+	;pslld		E1, 2
+	;pxor		E0, E1
+
+	;; Rounds 0-3
+	movdqu		MSG0, [INP + 0*16]
+	pshufb		MSG0, SHUF_MASK
+        DBGPRINT_XMM	MSG0
+		;sha1nexte	E0, MSG0
+		paddd		E0, MSG0 ; instead of sha1nexte
+		movdqa		E1, ABCD	; copy of ABCD taken before the rounds; consumed by the next sha1nexte
+		sha1rnds4	ABCD, E0, 0
+	 movdqu		 MSG0b, [INPb + 0*16]
+	 pshufb		 MSG0b, SHUF_MASK
+		 ;sha1nexte	 E0b, MSG0b
+		 paddd		 E0b, MSG0b ; instead of sha1nexte
+		 movdqa		 E1b, ABCDb
+		 sha1rnds4	 ABCDb, E0b, 0
+
+	;; Rounds 4-7
+	movdqu		MSG1, [INP + 1*16]
+	pshufb		MSG1, SHUF_MASK
+        DBGPRINT_XMM	MSG1
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+		sha1rnds4	ABCD, E1, 0
+	sha1msg1	MSG0, MSG1
+	 movdqu		 MSG1b, [INPb + 1*16]
+	 pshufb		 MSG1b, SHUF_MASK
+		 sha1nexte	 E1b, MSG1b
+		 movdqa		 E0b, ABCDb
+		 sha1rnds4	 ABCDb, E1b, 0
+	 sha1msg1	 MSG0b, MSG1b
+
+	;; Rounds 8-11
+	movdqu		MSG2, [INP + 2*16]
+	pshufb		MSG2, SHUF_MASK
+        DBGPRINT_XMM	MSG2
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+		sha1rnds4	ABCD, E0, 0
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+	 movdqu		 MSG2b, [INPb + 2*16]
+	 pshufb		 MSG2b, SHUF_MASK
+		 sha1nexte	 E0b, MSG2b
+		 movdqa		 E1b, ABCDb
+		 sha1rnds4	 ABCDb, E0b, 0
+	 sha1msg1	 MSG1b, MSG2b
+	 pxor		 MSG0b, MSG2b
+
+	;; Rounds 12-15
+	movdqu		MSG3, [INP + 3*16]
+	pshufb		MSG3, SHUF_MASK
+        DBGPRINT_XMM	MSG3
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 0
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+	 movdqu		 MSG3b, [INPb + 3*16]
+	 pshufb		 MSG3b, SHUF_MASK
+		 sha1nexte	 E1b, MSG3b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG0b, MSG3b
+		 sha1rnds4	 ABCDb, E1b, 0
+	 sha1msg1	 MSG2b, MSG3b
+	 pxor		 MSG1b, MSG3b
+
+
+	;; Rounds 16-19
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 0
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+		 sha1nexte	 E0b, MSG0b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG1b, MSG0b
+		 sha1rnds4	 ABCDb, E0b, 0
+	 sha1msg1	 MSG3b, MSG0b
+	 pxor		 MSG2b, MSG0b
+
+	;; Rounds 20-23
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 1	; immediate switches from 0 to 1 starting with round 20
+	sha1msg1	MSG0, MSG1
+	pxor		MSG3, MSG1
+		 sha1nexte	 E1b, MSG1b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG2b, MSG1b
+		 sha1rnds4	 ABCDb, E1b, 1
+	 sha1msg1	 MSG0b, MSG1b
+	 pxor		 MSG3b, MSG1b
+
+	;; Rounds 24-27
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 1
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+		 sha1nexte	 E0b, MSG2b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG3b, MSG2b
+		 sha1rnds4	 ABCDb, E0b, 1
+	 sha1msg1	 MSG1b, MSG2b
+	 pxor		 MSG0b, MSG2b
+
+	;; Rounds 28-31
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 1
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+		 sha1nexte	 E1b, MSG3b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG0b, MSG3b
+		 sha1rnds4	 ABCDb, E1b, 1
+	 sha1msg1	 MSG2b, MSG3b
+	 pxor		 MSG1b, MSG3b
+
+	;; Rounds 32-35
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 1
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+		 sha1nexte	 E0b, MSG0b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG1b, MSG0b
+		 sha1rnds4	 ABCDb, E0b, 1
+	 sha1msg1	 MSG3b, MSG0b
+	 pxor		 MSG2b, MSG0b
+
+	;; Rounds 36-39
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 1
+	sha1msg1	MSG0, MSG1
+	pxor		MSG3, MSG1
+		 sha1nexte	 E1b, MSG1b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG2b, MSG1b
+		 sha1rnds4	 ABCDb, E1b, 1
+	 sha1msg1	 MSG0b, MSG1b
+	 pxor		 MSG3b, MSG1b
+
+	;; Rounds 40-43
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 2	; immediate switches from 1 to 2 starting with round 40
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+		 sha1nexte	 E0b, MSG2b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG3b, MSG2b
+		 sha1rnds4	 ABCDb, E0b, 2
+	 sha1msg1	 MSG1b, MSG2b
+	 pxor		 MSG0b, MSG2b
+
+	;; Rounds 44-47
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 2
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+		 sha1nexte	 E1b, MSG3b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG0b, MSG3b
+		 sha1rnds4	 ABCDb, E1b, 2
+	 sha1msg1	 MSG2b, MSG3b
+	 pxor		 MSG1b, MSG3b
+
+	;; Rounds 48-51
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 2
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+		 sha1nexte	 E0b, MSG0b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG1b, MSG0b
+		 sha1rnds4	 ABCDb, E0b, 2
+	 sha1msg1	 MSG3b, MSG0b
+	 pxor		 MSG2b, MSG0b
+
+	;; Rounds 52-55
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 2
+	sha1msg1	MSG0, MSG1
+	pxor		MSG3, MSG1
+		 sha1nexte	 E1b, MSG1b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG2b, MSG1b
+		 sha1rnds4	 ABCDb, E1b, 2
+	 sha1msg1	 MSG0b, MSG1b
+	 pxor		 MSG3b, MSG1b
+
+	;; Rounds 56-59
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 2
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+		 sha1nexte	 E0b, MSG2b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG3b, MSG2b
+		 sha1rnds4	 ABCDb, E0b, 2
+	 sha1msg1	 MSG1b, MSG2b
+	 pxor		 MSG0b, MSG2b
+
+	;; Rounds 60-63
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 3	; immediate switches from 2 to 3 starting with round 60
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+		 sha1nexte	 E1b, MSG3b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG0b, MSG3b
+		 sha1rnds4	 ABCDb, E1b, 3
+	 sha1msg1	 MSG2b, MSG3b
+	 pxor		 MSG1b, MSG3b
+
+	;; Rounds 64-67
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 3
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+		 sha1nexte	 E0b, MSG0b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG1b, MSG0b
+		 sha1rnds4	 ABCDb, E0b, 3
+	 sha1msg1	 MSG3b, MSG0b
+	 pxor		 MSG2b, MSG0b
+
+	;; Rounds 68-71 (no sha1msg1 from here on)
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 3
+	pxor		MSG3, MSG1
+		 sha1nexte	 E1b, MSG1b
+		 movdqa		 E0b, ABCDb
+	 sha1msg2	 MSG2b, MSG1b
+		 sha1rnds4	 ABCDb, E1b, 3
+	 pxor		 MSG3b, MSG1b
+
+	;; Rounds 72-75
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 3
+		 sha1nexte	 E0b, MSG2b
+		 movdqa		 E1b, ABCDb
+	 sha1msg2	 MSG3b, MSG2b
+		 sha1rnds4	 ABCDb, E0b, 3
+
+	;; Rounds 76-79
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+		sha1rnds4	ABCD, E1, 3
+		 sha1nexte	 E1b, MSG3b
+		 movdqa		 E0b, ABCDb
+		 sha1rnds4	 ABCDb, E1b, 3
+
+	;; Need to rotate E left by 30
+	movdqa		E1, E0
+	pslld		E0, 30
+	psrld		E1, 2
+	pxor		E0, E1
+	 movdqa		 E1b, E0b
+	 pslld		 E0b, 30
+	 psrld		 E1b, 2
+	 pxor		 E0b, E1b
+
+	paddd		ABCD, [rsp + frame.ABCD_SAVE]	; add the state saved at the top of this pass
+	paddd		E0,   [rsp + frame.E_SAVE]
+	 paddd		 ABCDb, [rsp + frame.ABCD_SAVEb]
+	 paddd		 E0b,   [rsp + frame.E_SAVEb]
+
+	add		INP, 64
+	 add		 INPb, 64
+	cmp		INP, NUM_BLKS	; NUM_BLKS now holds lane A's end pointer; lane B is not checked separately
+	jne		loop0
+
+	;; write out digests
+	pshufd		ABCD, ABCD, 0x1B	; reverse the dwords again, undoing the shuffle done at load
+	movdqu		[args + 0*SHA1NI_DIGEST_ROW_SIZE], ABCD
+	pextrd		[args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0, 3
+        DBGPRINTL_XMM "jobA: digest out words[0-3]", ABCD
+        DBGPRINTL_XMM "jobA: digest out word 4", E0
+
+	 pshufd		 ABCDb, ABCDb, 0x1B
+	 movdqu		 [args + 1*SHA1NI_DIGEST_ROW_SIZE], ABCDb
+	 pextrd		 [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0b, 3
+
+	;; update input pointers
+	mov		[args + _data_ptr_sha1 + 0*PTR_SZ], INP
+	 mov		 [args + _data_ptr_sha1 + 1*PTR_SZ], INPb
+
+done_hash:
+
+        ;; Clear stack frame (4*16 bytes)
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+%assign i 0
+%rep 4
+        movdqa	[rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+	add		rsp, frame_size	; release the frame reserved on entry
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm
new file mode 100644
index 000000000..9039660cc
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm
@@ -0,0 +1,512 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; SHA1 code, hybrid, rolled, interleaved
+; Uses SSE instructions
+%include "include/os.asm"
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
+	dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+	dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+	dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+	dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+%define	MOVDQ movdqu ;; assume buffers not aligned
+
+%ifdef LINUX
+%define INP	rdi ; 1st arg
+%define CTX     rsi ; 2nd arg
+%define REG3	edx
+%define REG4	ecx
+%else
+%define INP	rcx ; 1st arg
+%define CTX     rdx ; 2nd arg
+%define REG3	edi
+%define REG4	esi
+%endif
+
+%define FRAMESZ (3*16 + 1*8)	; 3 XMM save slots (xmm6-xmm8, non-Linux build) + 8 bytes for the saved rsp; parenthesised so it is safe inside larger expressions
+%define _RSP	rsp + FRAMESZ - 1*8	; address of the last 8 bytes of the frame, where the caller's rsp is kept
+
+%define a   eax		; a-e: SHA-1 working variables, loaded from CTX words 0-4
+%define b   ebx
+%define c   REG3
+%define d   REG4
+%define e   r8d
+%define T1  r9d		; scratch: round input (W + K), then rol(a,5), then MAGIC temp; not renamed by ROTATE_ARGS
+%define f   r10d	; f, g, h: three extra names so that ROTATE_ARGS cycles over eight registers
+%define RND r11d	; loop pass counter; not renamed by ROTATE_ARGS
+%define g   r12d
+%define h   r13d
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XK    xmm2
+
+%xdefine X0 xmm3
+%xdefine X1 xmm4
+%xdefine X2 xmm5
+%xdefine X3 xmm6
+%xdefine X4 xmm7
+
+%define XFER  xmm8
+
+%define SZ 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+;; rotate_Xs: assembly-time rename only (emits no code): X0<-X1, X1<-X2, X2<-X3, X3<-X4, X4<-old X0
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X_
+%endmacro
+;; ROTATE_ARGS: assembly-time rename only (emits no code): old h becomes a, old a becomes b, ..., old g becomes h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+	;; %1 = F  (destination register, the only one written)
+	;; %2 = B, %3 = C, %4 = D  (inputs, read only)
+	;; %5 = T  (scratch register; accepted for a uniform signature,
+	;;          not used by this function)
+	;; Computes F = D ^ (B & (C ^ D))
+	mov  %1,%3		; F = C
+	xor  %1,%4		; F = C ^ D
+	and  %1,%2		; F = B & (C ^ D)
+	xor  %1,%4		; F = D ^ (B & (C ^ D))
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+	;; %1 = F  (destination register, the only one written)
+	;; %2 = B, %3 = C, %4 = D  (inputs, read only)
+	;; %5 = T  (scratch register; accepted for a uniform signature,
+	;;          not used by this function)
+	;; Computes F = B ^ C ^ D
+	mov  %1,%4		; F = D
+	xor  %1,%3		; F = D ^ C
+	xor  %1,%2		; F = D ^ C ^ B
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+	;; %1 = F  (destination register)
+	;; %2 = B, %3 = C, %4 = D  (inputs, read only)
+	;; %5 = T  (scratch register, overwritten)
+	;; Computes F = ((B | C) & D) | (B & C), which equals
+	;; (B & C) | (B & D) | (C & D)
+	mov  %1,%2		; F = B
+	mov  %5,%2		; T = B
+	or   %1,%3		; F = B | C
+	and  %5,%3		; T = B & C
+	and  %1,%4		; F = (B | C) & D
+	or   %1,%5		; F = ((B | C) & D) | (B & C)
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+	;; %1 = F  (destination register)
+	;; %2 = B, %3 = C, %4 = D  (inputs)
+	;; %5 = T  (scratch register, passed through)
+	;; Same function as MAGIC_F1 (F = B ^ C ^ D), so simply
+	;; forward all five arguments to it.
+	MAGIC_F1 %1,%2,%3,%4,%5
+%endmacro
+
+;; input is T1
+%macro ROUND 1
+%define %%MAGIC	%1
+	add   e,T1		; e += T1 (callers load T1 from XFER = message words + XK)
+	mov   T1,a
+	rol   T1,5		; T1 = rol(a, 5)
+	add   e,T1		; e += rol(a, 5)
+	%%MAGIC     h,b,c,d,T1      ;; FUN  = MAGIC_Fi(B,C,D), written to h; T1 is free to be used as scratch
+	rol   b,30
+	add   h,e		; h = FUN + e; this register is named 'a' after ROTATE_ARGS
+ROTATE_ARGS
+%endmacro
+;; do_4i MAGIC_Fi: four scalar rounds fed from X0 + XK, interleaved with the SIMD computation of the next four W words into X4
+%macro do_4i 1
+		movdqa	XFER, XK
+		paddd	XFER, X0		; XFER = four message words in X0 + round constant
+		pextrd	T1, XFER, 0
+	;ROUND %1 (inlined, 1st of 4; the less-indented scalar lines are the round)
+	add   e,T1
+		;SCHEDULE_4 (inlined; the deeper-indented SIMD lines build the next four W words)
+		movdqa	XTMP0, X1
+		palignr XTMP0, X0, 8		; XTMP0 = W[-14]
+	mov   T1,a
+		movdqa	XTMP1, X2
+	rol   T1,5
+		pxor	XTMP1, X0			; XTMP1 = W[-8] ^ W[-16]
+	add   e,T1
+		pxor	XTMP0, XTMP1		; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
+	%1     h,b,c,d,T1      ;; FUN  = MAGIC_Fi(B,C,D)
+
+		;; Finish low half
+		movdqa	X4, X3
+	rol   b,30
+		psrldq	X4, 4			; X4 = W[-3] {xxBA}
+	add   h,e
+ROTATE_ARGS
+		pextrd	T1, XFER, 1
+	;ROUND %1 (inlined, 2nd of 4)
+	add   e,T1
+		pxor	X4, XTMP0			;
+	mov   T1,a
+		movdqa	XTMP1, X4
+	rol   T1,5
+		;; rotate X4 left 1
+		psrld	XTMP1, (32-1)
+	add   e,T1
+		pslld	X4, 1
+	%1     h,b,c,d,T1      ;; FUN  = MAGIC_Fi(B,C,D)
+		pxor	X4, XTMP1			; X4 = W[0] {xxBA}
+	rol   b,30
+	add   h,e
+ROTATE_ARGS
+		pextrd	T1, XFER, 2
+	;ROUND %1 (inlined, 3rd of 4)
+	add   e,T1
+		movdqa	XTMP1, X4
+	mov   T1,a
+
+		;; Finish high half
+		palignr XTMP1, X3, 4		; XTMP1 = w[-3] {DCxx}
+	rol   T1,5
+	add   e,T1
+		pxor	XTMP0, XTMP1
+	%1     h,b,c,d,T1      ;; FUN  = MAGIC_Fi(B,C,D)
+		;; rotate XTMP0 left 1
+		movdqa	XTMP1, XTMP0
+		psrld	XTMP1, (32-1)
+	rol   b,30
+	add   h,e
+ROTATE_ARGS
+		pextrd	T1, XFER, 3
+	;ROUND %1 (inlined, 4th of 4)
+	add   e,T1
+	mov   T1,a
+		pslld	XTMP0, 1
+	rol   T1,5
+	add   e,T1
+		pxor	XTMP0, XTMP1		; XTMP0 = W[0] {DCxx}
+	%1     h,b,c,d,T1      ;; FUN  = MAGIC_Fi(B,C,D)
+		;; COMBINE HALVES
+		shufps	X4, XTMP0, 11100100b	; X4 = X[0] {DCBA}
+	rol   b,30
+	add   h,e
+
+		rotate_Xs
+ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_block_sse(void *input_data, UINT32 digest[5])
+;; arg 1 : (in) pointer to one block of data
+;; arg 2 : (in/out) pointer to read/write digest
+MKGLOBAL(sha1_block_sse,function,internal)
+align 32
+sha1_block_sse:
+	push	rbx
+	push	rsi		; rsi/rdi back REG4/REG3 in the non-Linux build
+	push	rdi
+	push	r12
+	push	r13
+
+	movdqa	XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+%ifndef LINUX
+	mov	rax, rsp		; copy rsp
+	sub	rsp, FRAMESZ
+	and	rsp, -16		; align stack frame
+	mov	[_RSP],rax		; save copy of rsp
+	movdqa	[rsp + 0 * 16], xmm6	; xmm6-xmm8 (X3, X4, XFER) are callee-saved in the Microsoft x64 ABI
+	movdqa	[rsp + 1 * 16], xmm7
+	movdqa	[rsp + 2 * 16], xmm8
+
+%endif
+	MOVDQ	X0, [INP + 0*16]
+	MOVDQ	X1, [INP + 1*16]
+
+	;; load the second 32 bytes of the same 64-byte block
+	MOVDQ	X2, [INP + 2*16]
+	MOVDQ	X3, [INP + 3*16]
+
+        ;; set up a-e based on h0-h4
+	;; byte swap first 16 dwords
+	mov	a, [SZ*0 + CTX]
+	pshufb	X0, XTMP0
+	mov	b, [SZ*1 + CTX]
+	pshufb	X1, XTMP0
+	mov	c, [SZ*2 + CTX]
+	pshufb	X2, XTMP0
+	mov	d, [SZ*3 + CTX]
+	pshufb	X3, XTMP0
+	mov	e, [SZ*4 + CTX]
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;; do rounds 00-19: five do_4i at run time (one on entry at loop1_5, then two per pass of loop1)
+	;; The 4 ROTATE_ARGS + 4 rotate_Xs below, plus the renames inside the skipped do_4i, add up to full cycles (8 and 5).
+	movdqa	XK, [rel K00_19]
+	mov	RND, 3			; three passes through the bottom of the loop
+	ROTATE_ARGS
+	ROTATE_ARGS
+	ROTATE_ARGS
+	ROTATE_ARGS
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	jmp	loop1_5			; first pass skips the do_4i at loop1
+align 16
+loop1:
+
+	do_4i	MAGIC_F0
+
+loop1_5:
+	do_4i	MAGIC_F0
+
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	movdqa	X0, X2			; physically move the W registers so they match the X names in effect at loop1
+	movdqa	X2, X4
+	movdqa	X4, X1
+	movdqa	X1, X3
+
+	sub	RND, 1
+	jne	loop1
+
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;; end rounds 00-19 (the four rotate_Xs above re-sync the X names with the registers after the last moves)
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;; do rounds 20-39 (same loop structure as rounds 00-19, using K20_39 and MAGIC_F1)
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa	XK, [rel K20_39]
+	mov	RND, 3
+	ROTATE_ARGS
+	ROTATE_ARGS
+	ROTATE_ARGS
+	ROTATE_ARGS
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	jmp	loop2_5
+align 16
+loop2:
+
+	do_4i	MAGIC_F1
+
+loop2_5:
+	do_4i	MAGIC_F1
+
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	movdqa	X0, X2
+	movdqa	X2, X4
+	movdqa	X4, X1
+	movdqa	X1, X3
+
+	sub	RND, 1
+	jne	loop2
+
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;; end rounds 20-39
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;; do rounds 40-59 (same loop structure as rounds 00-19, using K40_59 and MAGIC_F2)
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa	XK, [rel K40_59]
+	mov	RND, 3
+	ROTATE_ARGS
+	ROTATE_ARGS
+	ROTATE_ARGS
+	ROTATE_ARGS
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	jmp	loop3_5
+align 16
+loop3:
+
+	do_4i	MAGIC_F2
+
+loop3_5:
+	do_4i	MAGIC_F2
+
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	movdqa	X0, X2
+	movdqa	X2, X4
+	movdqa	X4, X1
+	movdqa	X1, X3
+
+	sub	RND, 1
+	jne	loop3
+
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+	rotate_Xs
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;; end rounds 40-59
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;; do rounds 60-79: one do_4i (rounds 60-63, builds the last four W words), then 16 plain ROUNDs on X0..X3
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa	XK, [rel K60_79]
+
+	do_4i	MAGIC_F3
+
+	movdqa	XFER, XK
+	paddd	XFER, X0
+	pextrd	T1, XFER, 0
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 1
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 2
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 3
+	ROUND MAGIC_F3
+
+	movdqa	XFER, XK
+	paddd	XFER, X1
+	pextrd	T1, XFER, 0
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 1
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 2
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 3
+	ROUND MAGIC_F3
+
+	movdqa	XFER, XK
+	paddd	XFER, X2
+	pextrd	T1, XFER, 0
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 1
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 2
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 3
+	ROUND MAGIC_F3
+
+	movdqa	XFER, XK
+	paddd	XFER, X3
+	pextrd	T1, XFER, 0
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 1
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 2
+	ROUND MAGIC_F3
+	pextrd	T1, XFER, 3
+	ROUND MAGIC_F3
+
+        ;; update result digest h0-h4 (the working variables are added into the digest in memory)
+	add	[SZ*0 + CTX], a
+	add	[SZ*1 + CTX], b
+	add	[SZ*2 + CTX], c
+	add	[SZ*3 + CTX], d
+	add	[SZ*4 + CTX], e
+
+%ifndef LINUX
+	movdqa	xmm8, [rsp + 2 * 16]	; restore the XMM registers saved on entry
+	movdqa	xmm7, [rsp + 1 * 16]
+	movdqa	xmm6, [rsp + 0 * 16]
+
+%ifdef SAFE_DATA
+        ;; Clear potential sensitive data stored in stack
+        pxor    xmm0, xmm0
+        movdqa  [rsp + 0 * 16], xmm0
+        movdqa  [rsp + 1 * 16], xmm0
+        movdqa  [rsp + 2 * 16], xmm0
+%endif
+
+	mov	rsp, [_RSP]		; reload the rsp value stored before the frame was aligned
+%endif ;; LINUX
+
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbx
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm
new file mode 100644
index 000000000..f0914d799
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm
@@ -0,0 +1,33 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define FUNC sha224_block_sse
+
+%include "sse/sha256_one_block_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm
new file mode 100644
index 000000000..fa593defa
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm
@@ -0,0 +1,614 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers:		RAX RBX RCX RDX RBP RSI RDI R8  R9  R10 R11 R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Windows clobbers:	        RCX RDX     RSI RDI             R11
+;; Windows preserves:	RAX RBX         RBP         R8  R9  R10     R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;; Linux clobbers:	        RCX RDX     RSI RDI             R11
+;; Linux preserves:	RAX RBX         RBP         R8  R9  R10     R12 R13 R14 R15
+;;			-----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+
+; resdq = reso => 16 bytes
+struc frame
+.ABEF_SAVE	reso	1	; lane A STATE0 (ABEF) as it was at the start of the current block
+.CDGH_SAVE	reso	1	; lane A STATE1 (CDGH) as it was at the start of the current block
+.ABEF_SAVEb	reso	1	; lane B STATE0b (ABEF) as it was at the start of the current block
+.CDGH_SAVEb	reso	1	; lane B STATE1b (CDGH) as it was at the start of the current block
+.align		resq	1	; 8 bytes of padding: frame_size = 4*16 + 8 = 72
+endstruc
+
+%ifdef LINUX
+%define arg1	rdi	; 1st incoming argument
+%define arg2	rsi	; 2nd incoming argument
+%define arg3	rcx	; not an incoming argument: scratch, loaded from args below
+%define arg4	rdx	; not an incoming argument: scratch, loaded from args below
+%else
+%define arg1	rcx	; 1st incoming argument
+%define arg2	rdx	; 2nd incoming argument
+%define arg3	rdi	; not an incoming argument: scratch, loaded from args below
+%define arg4	rsi	; not an incoming argument: scratch, loaded from args below
+%endif
+
+%define args            arg1
+%define NUM_BLKS 	arg2	; block count on entry; later lane A's end-of-data pointer
+
+%define INP		arg3	; lane A data pointer
+%define INPb		arg4	; lane B data pointer
+
+
+%define SHA256CONSTANTS	r11	; base address of K256
+
+;; MSG MUST be xmm0 (implicit argument)
+%define MSG		xmm0
+%define STATE0		xmm1
+%define STATE1		xmm2
+%define MSGTMP0		xmm3
+%define MSGTMP1		xmm4
+%define MSGTMP2		xmm5
+%define MSGTMP3		xmm6
+%define MSGTMP4		xmm7	; lane A temporary for the digest reorder
+
+%define STATE0b		xmm8
+%define STATE1b		xmm9
+%define MSGTMP0b	xmm10
+%define MSGTMP1b	xmm11
+%define MSGTMP2b	xmm12
+%define MSGTMP3b	xmm13
+%define MSGTMP		xmm14	; temporary shared by both lanes
+
+%define SHUF_MASK	xmm15	; holds PSHUFFLE_BYTE_FLIP_MASK
+
+section .data
+default rel
+align 64
+K256:	; 64 round-constant dwords, 4 per row; row i is added to the message words of rounds 4i..4i+3
+	dd	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	dd	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	dd	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	dd	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	dd	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	dd	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	dd	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	dd	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	dd	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	dd	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	dd	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	dd	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	dd	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	dd	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	dd	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	dd	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:	; pshufb control that reverses the byte order inside each dword
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_ni(SHA256_ARGS *args, UINT32 size_in_blocks)
+;; arg1 : pointer to args (two digest rows and two data pointers are read and written back)
+;; arg2 : size (in blocks), shared by both lanes ;; assumed to be >= 1 (0 jumps straight to done_hash)
+section .text
+MKGLOBAL(sha256_ni,function,internal)
+align 32
+sha256_ni:
+	sub		rsp, frame_size	; room for the saved digests of both lanes
+
+        DBGPRINTL "enter sha256-ni-x2"
+
+	shl		NUM_BLKS, 6	; convert to bytes
+	jz		done_hash	; shl left ZF set: zero bytes to hash
+
+        DBGPRINTL64	"jobA/B byte size:", NUM_BLKS
+
+	;; load input pointers
+	mov		INP, [args + _data_ptr_sha256 + 0*PTR_SZ]
+	mov		INPb, [args + _data_ptr_sha256 + 1*PTR_SZ]
+
+	add		NUM_BLKS, INP	; pointer to end of data
+
+	;; load initial digest
+	;; Probably need to reorder these appropriately
+	;; DCBA, HGFE -> ABEF, CDGH
+
+	movdqu		STATE0, [args + 0*SHA256NI_DIGEST_ROW_SIZE]
+	movdqu		STATE1,	[args + 0*SHA256NI_DIGEST_ROW_SIZE + 16]
+	 movdqu		 STATE0b, [args + 1*SHA256NI_DIGEST_ROW_SIZE]
+	 movdqu		 STATE1b, [args + 1*SHA256NI_DIGEST_ROW_SIZE + 16]
+        DBGPRINTL	"jobA digest in:"
+	DBGPRINT_XMM	STATE0
+	DBGPRINT_XMM	STATE1
+        DBGPRINTL	"jobB digest in:"
+	DBGPRINT_XMM	STATE0b
+	DBGPRINT_XMM	STATE1b
+
+	pshufd		STATE0, STATE0, 0xB1	; CDAB
+	pshufd		STATE1, STATE1, 0x1B	; EFGH
+	movdqa		MSGTMP4, STATE0
+	 pshufd		 STATE0b, STATE0b, 0xB1	; CDAB
+	 pshufd		 STATE1b, STATE1b, 0x1B	; EFGH
+	 movdqa		 MSGTMP, STATE0b
+	palignr		STATE0, STATE1, 8	; ABEF
+	 palignr	 STATE0b, STATE1b, 8	; ABEF
+	pblendw		STATE1, MSGTMP4, 0xF0	; CDGH
+	 pblendw	 STATE1b, MSGTMP, 0xF0	; CDGH
+
+	lea		SHA256CONSTANTS,[rel K256]
+	movdqa		SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+%ifdef DO_DBGPRINT
+	;;	print buffer A
+	push		r10
+	push		NUM_BLKS
+	DBGPRINTL 	"jobA data:"
+	xor		r10, r10
+	sub		NUM_BLKS, INP	; end pointer -> byte count (end pointer restored by the pop below)
+.loop_dbgA:
+	movdqu		MSG, [INP + r10 + 0*16]
+        DBGPRINT_XMM	MSG
+	movdqu		MSG, [INP + r10 + 1*16]
+        DBGPRINT_XMM	MSG
+	movdqu		MSG, [INP + r10 + 2*16]
+        DBGPRINT_XMM	MSG
+	movdqu		MSG, [INP + r10 + 3*16]
+        DBGPRINT_XMM	MSG
+	add		r10, 64
+	cmp		NUM_BLKS, r10
+	jne		.loop_dbgA
+	pop		NUM_BLKS
+	pop		r10
+%endif
+
+%ifdef DO_DBGPRINT
+	;;	print buffer B
+	push		r10
+	push		NUM_BLKS
+	DBGPRINTL 	"jobB data:"
+	xor		r10, r10
+	sub		NUM_BLKS, INP	; lane A's byte count is reused as the bound for lane B
+.loop_dbgB:
+	movdqu		MSG, [INPb + r10 + 0*16]
+        DBGPRINT_XMM	MSG
+	movdqu		MSG, [INPb + r10 + 1*16]
+        DBGPRINT_XMM	MSG
+	movdqu		MSG, [INPb + r10 + 2*16]
+        DBGPRINT_XMM	MSG
+	movdqu		MSG, [INPb + r10 + 3*16]
+        DBGPRINT_XMM	MSG
+	add		r10, 64
+	cmp		NUM_BLKS, r10
+	jne		.loop_dbgB
+	pop		NUM_BLKS
+	pop		r10
+%endif
+
+.loop0:	; one 64-byte block per lane per iteration
+	;; Save digests
+	movdqa		[rsp + frame.ABEF_SAVE], STATE0
+	movdqa		[rsp + frame.CDGH_SAVE], STATE1
+	movdqa		 [rsp + frame.ABEF_SAVEb], STATE0b
+	movdqa		 [rsp + frame.CDGH_SAVEb], STATE1b
+
+	;; Rounds 0-3
+	movdqu		MSG, [INP + 0*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP0, MSG
+		paddd		MSG, [SHA256CONSTANTS + 0*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqu		 MSG, [INPb + 0*16]
+	 pshufb		 MSG, SHUF_MASK
+	 movdqa		 MSGTMP0b, MSG
+		 paddd		 MSG, [SHA256CONSTANTS + 0*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+
+	;; Rounds 4-7
+	movdqu		MSG, [INP + 1*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP1, MSG
+		paddd		MSG, [SHA256CONSTANTS + 1*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqu		 MSG, [INPb + 1*16]
+	 pshufb		 MSG, SHUF_MASK
+	 movdqa		 MSGTMP1b, MSG
+		 paddd		 MSG, [SHA256CONSTANTS + 1*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP0, MSGTMP1
+	 sha256msg1	 MSGTMP0b, MSGTMP1b
+
+	;; Rounds 8-11
+	movdqu		MSG, [INP + 2*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP2, MSG
+		paddd		MSG, [SHA256CONSTANTS + 2*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqu		 MSG, [INPb + 2*16]
+	 pshufb		 MSG, SHUF_MASK
+	 movdqa		 MSGTMP2b, MSG
+		 paddd		 MSG, [SHA256CONSTANTS + 2*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP1, MSGTMP2
+	 sha256msg1	 MSGTMP1b, MSGTMP2b
+
+	;; Rounds 12-15
+	movdqu		MSG, [INP + 3*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP3, MSG
+		paddd		MSG, [SHA256CONSTANTS + 3*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP3
+	palignr		MSGTMP, MSGTMP2, 4
+	paddd		MSGTMP0, MSGTMP
+	sha256msg2	MSGTMP0, MSGTMP3
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqu		 MSG, [INPb + 3*16]
+	 pshufb		 MSG, SHUF_MASK
+	 movdqa		 MSGTMP3b, MSG
+		 paddd		 MSG, [SHA256CONSTANTS + 3*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP3b
+	 palignr	 MSGTMP, MSGTMP2b, 4
+	 paddd		 MSGTMP0b, MSGTMP
+	 sha256msg2	 MSGTMP0b, MSGTMP3b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP2, MSGTMP3
+	 sha256msg1	 MSGTMP2b, MSGTMP3b
+
+	;; Rounds 16-19
+	movdqa		MSG, MSGTMP0
+		paddd		MSG, [SHA256CONSTANTS + 4*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP0
+	palignr		MSGTMP, MSGTMP3, 4
+	paddd		MSGTMP1, MSGTMP
+	sha256msg2	MSGTMP1, MSGTMP0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP0b
+		 paddd		 MSG, [SHA256CONSTANTS + 4*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP0b
+	 palignr	 MSGTMP, MSGTMP3b, 4
+	 paddd		 MSGTMP1b, MSGTMP
+	 sha256msg2	 MSGTMP1b, MSGTMP0b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP3, MSGTMP0
+	 sha256msg1	 MSGTMP3b, MSGTMP0b
+
+	;; Rounds 20-23
+	movdqa		MSG, MSGTMP1
+		paddd		MSG, [SHA256CONSTANTS + 5*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP1
+	palignr		MSGTMP, MSGTMP0, 4
+	paddd		MSGTMP2, MSGTMP
+	sha256msg2	MSGTMP2, MSGTMP1
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP1b
+		 paddd		 MSG, [SHA256CONSTANTS + 5*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP1b
+	 palignr	 MSGTMP, MSGTMP0b, 4
+	 paddd		 MSGTMP2b, MSGTMP
+	 sha256msg2	 MSGTMP2b, MSGTMP1b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP0, MSGTMP1
+	 sha256msg1	 MSGTMP0b, MSGTMP1b
+
+	;; Rounds 24-27
+	movdqa		MSG, MSGTMP2
+		paddd		MSG, [SHA256CONSTANTS + 6*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP2
+	palignr		MSGTMP, MSGTMP1, 4
+	paddd		MSGTMP3, MSGTMP
+	sha256msg2	MSGTMP3, MSGTMP2
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP2b
+		 paddd		 MSG, [SHA256CONSTANTS + 6*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP2b
+	 palignr	 MSGTMP, MSGTMP1b, 4
+	 paddd		 MSGTMP3b, MSGTMP
+	 sha256msg2	 MSGTMP3b, MSGTMP2b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP1, MSGTMP2
+	 sha256msg1	 MSGTMP1b, MSGTMP2b
+
+	;; Rounds 28-31
+	movdqa		MSG, MSGTMP3
+		paddd		MSG, [SHA256CONSTANTS + 7*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP3
+	palignr		MSGTMP, MSGTMP2, 4
+	paddd		MSGTMP0, MSGTMP
+	sha256msg2	MSGTMP0, MSGTMP3
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP3b
+		 paddd		 MSG, [SHA256CONSTANTS + 7*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP3b
+	 palignr	 MSGTMP, MSGTMP2b, 4
+	 paddd		 MSGTMP0b, MSGTMP
+	 sha256msg2	 MSGTMP0b, MSGTMP3b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP2, MSGTMP3
+	 sha256msg1	 MSGTMP2b, MSGTMP3b
+
+	;; Rounds 32-35
+	movdqa		MSG, MSGTMP0
+		paddd		MSG, [SHA256CONSTANTS + 8*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP0
+	palignr		MSGTMP, MSGTMP3, 4
+	paddd		MSGTMP1, MSGTMP
+	sha256msg2	MSGTMP1, MSGTMP0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP0b
+		 paddd		 MSG, [SHA256CONSTANTS + 8*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP0b
+	 palignr	 MSGTMP, MSGTMP3b, 4
+	 paddd		 MSGTMP1b, MSGTMP
+	 sha256msg2	 MSGTMP1b, MSGTMP0b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP3, MSGTMP0
+	 sha256msg1	 MSGTMP3b, MSGTMP0b
+
+	;; Rounds 36-39
+	movdqa		MSG, MSGTMP1
+		paddd		MSG, [SHA256CONSTANTS + 9*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP1
+	palignr		MSGTMP, MSGTMP0, 4
+	paddd		MSGTMP2, MSGTMP
+	sha256msg2	MSGTMP2, MSGTMP1
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP1b
+		 paddd		 MSG, [SHA256CONSTANTS + 9*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP1b
+	 palignr	 MSGTMP, MSGTMP0b, 4
+	 paddd		 MSGTMP2b, MSGTMP
+	 sha256msg2	 MSGTMP2b, MSGTMP1b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP0, MSGTMP1
+	 sha256msg1	 MSGTMP0b, MSGTMP1b
+
+	;; Rounds 40-43
+	movdqa		MSG, MSGTMP2
+		paddd		MSG, [SHA256CONSTANTS + 10*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP2
+	palignr		MSGTMP, MSGTMP1, 4
+	paddd		MSGTMP3, MSGTMP
+	sha256msg2	MSGTMP3, MSGTMP2
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP2b
+		 paddd		 MSG, [SHA256CONSTANTS + 10*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP2b
+	 palignr	 MSGTMP, MSGTMP1b, 4
+	 paddd		 MSGTMP3b, MSGTMP
+	 sha256msg2	 MSGTMP3b, MSGTMP2b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP1, MSGTMP2
+	 sha256msg1	 MSGTMP1b, MSGTMP2b
+
+	;; Rounds 44-47
+	movdqa		MSG, MSGTMP3
+		paddd		MSG, [SHA256CONSTANTS + 11*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP3
+	palignr		MSGTMP, MSGTMP2, 4
+	paddd		MSGTMP0, MSGTMP
+	sha256msg2	MSGTMP0, MSGTMP3
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP3b
+		 paddd		 MSG, [SHA256CONSTANTS + 11*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP3b
+	 palignr	 MSGTMP, MSGTMP2b, 4
+	 paddd		 MSGTMP0b, MSGTMP
+	 sha256msg2	 MSGTMP0b, MSGTMP3b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP2, MSGTMP3
+	 sha256msg1	 MSGTMP2b, MSGTMP3b
+
+	;; Rounds 48-51
+	movdqa		MSG, MSGTMP0
+		paddd		MSG, [SHA256CONSTANTS + 12*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP0
+	palignr		MSGTMP, MSGTMP3, 4
+	paddd		MSGTMP1, MSGTMP
+	sha256msg2	MSGTMP1, MSGTMP0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP0b
+		 paddd		 MSG, [SHA256CONSTANTS + 12*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP0b
+	 palignr	 MSGTMP, MSGTMP3b, 4
+	 paddd		 MSGTMP1b, MSGTMP
+	 sha256msg2	 MSGTMP1b, MSGTMP0b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+	sha256msg1	MSGTMP3, MSGTMP0
+	 sha256msg1	 MSGTMP3b, MSGTMP0b
+
+	;; Rounds 52-55
+	movdqa		MSG, MSGTMP1
+		paddd		MSG, [SHA256CONSTANTS + 13*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP1
+	palignr		MSGTMP, MSGTMP0, 4
+	paddd		MSGTMP2, MSGTMP
+	sha256msg2	MSGTMP2, MSGTMP1
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP1b
+		 paddd		 MSG, [SHA256CONSTANTS + 13*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP1b
+	 palignr	 MSGTMP, MSGTMP0b, 4
+	 paddd		 MSGTMP2b, MSGTMP
+	 sha256msg2	 MSGTMP2b, MSGTMP1b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+
+	;; Rounds 56-59
+	movdqa		MSG, MSGTMP2
+		paddd		MSG, [SHA256CONSTANTS + 14*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+	movdqa		MSGTMP, MSGTMP2
+	palignr		MSGTMP, MSGTMP1, 4
+	paddd		MSGTMP3, MSGTMP
+	sha256msg2	MSGTMP3, MSGTMP2
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP2b
+		 paddd		 MSG, [SHA256CONSTANTS + 14*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+	 movdqa		 MSGTMP, MSGTMP2b
+	 palignr	 MSGTMP, MSGTMP1b, 4
+	 paddd		 MSGTMP3b, MSGTMP
+	 sha256msg2	 MSGTMP3b, MSGTMP2b
+		 pshufd		 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+
+	;; Rounds 60-63
+	movdqa		MSG, MSGTMP3
+		paddd		MSG, [SHA256CONSTANTS + 15*16]
+		sha256rnds2	STATE1, STATE0, MSG	; MSG is implicit argument
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1, MSG	; MSG is implicit argument
+	 movdqa		 MSG, MSGTMP3b
+		 paddd		 MSG, [SHA256CONSTANTS + 15*16]
+		 sha256rnds2	 STATE1b, STATE0b, MSG	; MSG is implicit argument
+		 pshufd 	 MSG, MSG, 0x0E
+		 sha256rnds2	 STATE0b, STATE1b, MSG	; MSG is implicit argument
+
+	paddd		STATE0, [rsp + frame.ABEF_SAVE]
+	paddd		STATE1, [rsp + frame.CDGH_SAVE]
+	 paddd		 STATE0b, [rsp + frame.ABEF_SAVEb]
+	 paddd		 STATE1b, [rsp + frame.CDGH_SAVEb]
+
+	add		INP, 64
+	 add		 INPb, 64
+	cmp		INP, NUM_BLKS	; NUM_BLKS is lane A's end pointer; lane B advances in lockstep
+	jne		.loop0
+
+	;; update data pointers
+	mov		[args + _data_ptr_sha256 + 0*PTR_SZ], INP
+	mov		 [args + _data_ptr_sha256 + 1*PTR_SZ], INPb
+
+	; Reorder for writeback
+	pshufd		STATE0, STATE0, 0x1B	; FEBA
+	pshufd		STATE1, STATE1, 0xB1	; DCHG
+	movdqa		MSGTMP4, STATE0
+	 pshufd		 STATE0b, STATE0b, 0x1B	; FEBA
+	 pshufd		 STATE1b, STATE1b, 0xB1	; DCHG
+	 movdqa		 MSGTMP, STATE0b
+	pblendw		STATE0, STATE1,  0xF0	; DCBA
+	 pblendw	 STATE0b, STATE1b,  0xF0 ; DCBA
+	palignr		STATE1, MSGTMP4,  8	; HGFE
+	 palignr	 STATE1b, MSGTMP,  8	; HGFE
+
+	;; update digests
+	movdqu		[args + 0*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0
+	movdqu		[args + 0*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1
+	 movdqu		 [args + 1*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0b
+	 movdqu		 [args + 1*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1b
+
+        DBGPRINTL	"jobA digest out:"
+	DBGPRINT_XMM	STATE0
+	DBGPRINT_XMM	STATE1
+        DBGPRINTL	"jobB digest out:"
+	DBGPRINT_XMM	STATE0b
+	DBGPRINT_XMM	STATE1b
+
+done_hash:
+        DBGPRINTL	"exit sha256-ni-x2"
+
+        ;; Clear stack frame (4*16 bytes)
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+%assign i 0
+%rep 4
+        movdqa	[rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+	add		rsp, frame_size
+	ret
+
+%ifdef LINUX	; emit the GNU-stack note so the object does not request an executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm
new file mode 100644
index 000000000..8869c14ef
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm
@@ -0,0 +1,512 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "include/os.asm"
+
+section .data
+default rel
+align 64
+K256:	; 64 round-constant dwords, 4 per row; the round loops below walk them through TBL
+	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b	; reverses the byte order inside each dword
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA:              ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+	dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF	; 0xFF selector bytes make pshufb write zero
+; shuffle xDxC -> DC00
+_SHUF_DC00:              ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
+	dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100	; 0xFF selector bytes make pshufb write zero
+
+section .text
+
+
+%define	MOVDQ movdqu ;; assume buffers not aligned
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; %1 = destination xmm, %2 = memory operand (loaded with MOVDQ), %3 = xmm holding the pshufb mask; only %1 is written
+%macro COPY_XMM_AND_BSWAP 3
+	MOVDQ %1, %2	; 16-byte load
+	pshufb %1, %3	; permute the loaded bytes as dictated by the mask in %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4	; X0..X3: the 16 most recent message-schedule dwords, 4 per register
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER  xmm9	; K + W staging register, spilled to [rsp + _XFER]
+
+%define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00	xmm11 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK	xmm12
+
+%ifdef LINUX
+%define CTX	rsi	; 2nd arg
+%define INP	rdi	; 1st arg
+
+%define SRND	rdi	; clobbers INP
+%define c	ecx
+%define d 	r8d
+%define e 	edx
+%else
+%define CTX	rdx 	; 2nd arg
+%define INP	rcx 	; 1st arg
+
+%define SRND	rcx	; clobbers INP
+%define c 	edi
+%define d	esi
+%define e 	r8d
+
+%endif
+%define TBL	rbp	; walks through K256
+%define a eax
+%define b ebx
+
+%define f r9d
+%define g r10d
+%define h r11d
+
+%define y0 r13d	; y0..y2: scalar scratch for the round computation
+%define y1 r14d
+%define y2 r15d
+
+
+struc STACK
+%ifndef LINUX
+_XMM_SAVE:	reso	7	; save area for xmm6-xmm12 (non-Linux builds only)
+%endif
+_XFER:		reso	1	; 4 dwords of K + W consumed by the scalar rounds
+endstruc
+
+%ifndef FUNC	; an including file may pre-define FUNC to give the routine another name
+%define FUNC sha256_block_sse
+%endif
+
+; rotate_Xs
+; Assembly-time rename only (no instructions): X0<-X1, X1<-X2, X2<-X3, X3<-old X0
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Assembly-time rename only (no instructions): each of b..h takes the register of its predecessor, a takes old h's
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+		;; Four scalar rounds (K+W read from [rsp + _XFER]) interleaved with scheduling the next 4 W dwords into X0:
+		;; s0 is computed four at a time and s1 two at a time, starting from W[-16] + W[-7] (4 at a time)
+		movdqa	XTMP0, X3
+	mov	y0, e		; y0 = e
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+		palignr	XTMP0, X2, 4	; XTMP0 = W[-7]
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+		movdqa	XTMP1, X1
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+	xor	y2, g		; y2 = f^g
+		paddd	XTMP0, X0	; XTMP0 = W[-7] + W[-16]
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+		;; compute s0
+		palignr	XTMP1, X0, 4	; XTMP1 = W[-15]
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+		movdqa	XTMP2, XTMP1	; XTMP2 = W[-15]
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
+		movdqa	XTMP3, XTMP1	; XTMP3 = W[-15]
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pslld	XTMP1, (32-7)
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		psrld	XTMP2, 7
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		por	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		movdqa	XTMP2, XTMP3	; XTMP2 = W[-15]
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+		movdqa	XTMP4, XTMP3	; XTMP4 = W[-15]
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+		pslld	XTMP3, (32-18)
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor	y2, g		; y2 = f^g
+		psrld	XTMP2, 18
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+		pxor	XTMP1, XTMP3
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+		psrld	XTMP4, 3	; XTMP4 = W[-15] >> 3
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+		pxor	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pxor	XTMP1, XTMP4	; XTMP1 = s0
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		;; compute low s1
+		pshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		paddd	XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+		movdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	mov	y2, f		; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xBxA}
+	xor	y2, g		; y2 = f^g
+		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xBxA}
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+		psrld	XTMP4, 10	; XTMP4 = W[-2] >> 10 {BBAA}
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+		pxor	XTMP2, XTMP3
+	add	y2, y0		; y2 = S1 + CH
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
+		pxor	XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pshufb	XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		paddd	XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		;; compute high s1
+		pshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
+	mov	y0, e		; y0 = e
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+		movdqa	X0,    XTMP2	; X0    = W[-2] {DDCC}
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xDxC}
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+	xor	y2, g		; y2 = f^g
+		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xDxC}
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+		psrld	X0,    10	; X0 = W[-2] >> 10 {DDCC}
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+		pxor	XTMP2, XTMP3
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
+		pxor	X0, XTMP2	; X0 = s1 {xDxC}
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pshufb	X0, SHUF_DC00	; X0 = s1 {DC00}
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		paddd	X0, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; DO_ROUND n: one scalar round without scheduling; input is [rsp + _XFER + %1 * 4]; ends with ROTATE_ARGS
+%macro DO_ROUND 1
+	mov	y0, e		; y0 = e
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	mov	y2, f		; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor	y2, g		; y2 = f^g
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	and	y2, e		; y2 = (f^g)&e
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	add	y2, y0		; y2 = S1 + CH
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void FUNC(void *input_data, UINT32 digest[8])
+;; arg 1 : pointer to input data (one 64-byte block, read with unaligned loads)
+;; arg 2 : pointer to digest (8 dwords, updated in place)
+section .text
+MKGLOBAL(FUNC,function,internal)
+align 32
+FUNC:
+	push	rbx
+%ifndef LINUX
+	push	rsi
+	push	rdi
+%endif
+	push	rbp
+	push	r13
+	push	r14
+	push	r15
+
+	sub	rsp,STACK_size
+%ifndef LINUX
+	movdqa	[rsp + _XMM_SAVE + 0*16],xmm6
+	movdqa	[rsp + _XMM_SAVE + 1*16],xmm7
+	movdqa	[rsp + _XMM_SAVE + 2*16],xmm8
+	movdqa	[rsp + _XMM_SAVE + 3*16],xmm9
+	movdqa	[rsp + _XMM_SAVE + 4*16],xmm10
+	movdqa	[rsp + _XMM_SAVE + 5*16],xmm11
+	movdqa	[rsp + _XMM_SAVE + 6*16],xmm12
+%endif
+
+	;; load initial digest
+	mov	a, [4*0 + CTX]
+	mov	b, [4*1 + CTX]
+	mov	c, [4*2 + CTX]
+	mov	d, [4*3 + CTX]
+	mov	e, [4*4 + CTX]
+	mov	f, [4*5 + CTX]
+	mov	g, [4*6 + CTX]
+	mov	h, [4*7 + CTX]
+
+	movdqa	BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+	movdqa	SHUF_00BA, [rel _SHUF_00BA]
+	movdqa	SHUF_DC00, [rel _SHUF_DC00]
+
+	lea	TBL,[rel K256]
+
+	;; byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+
+	;; schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov	SRND, 3		; SRND reuses INP's register; the input was fully loaded above
+align 16
+loop1:
+	movdqa	XFER, [TBL + 0*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa	XFER, [TBL + 1*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa	XFER, [TBL + 2*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa	XFER, [TBL + 3*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	add	TBL, 4*16
+	FOUR_ROUNDS_AND_SCHED
+
+	sub	SRND, 1
+	jne	loop1
+
+	mov	SRND, 2		; remaining 16 rounds: 2 passes of 8, no further scheduling
+loop2:
+	paddd	X0, [TBL + 0*16]
+	movdqa	[rsp + _XFER], X0
+	DO_ROUND	0
+	DO_ROUND	1
+	DO_ROUND	2
+	DO_ROUND	3
+	paddd	X1, [TBL + 1*16]
+	movdqa	[rsp + _XFER], X1
+	add	TBL, 2*16
+	DO_ROUND	0
+	DO_ROUND	1
+	DO_ROUND	2
+	DO_ROUND	3
+
+	movdqa	X0, X2
+	movdqa	X1, X3
+
+	sub	SRND, 1
+	jne	loop2
+
+	add	[4*0 + CTX], a
+	add	[4*1 + CTX], b
+	add	[4*2 + CTX], c
+	add	[4*3 + CTX], d
+	add	[4*4 + CTX], e
+	add	[4*5 + CTX], f
+	add	[4*6 + CTX], g
+	add	[4*7 + CTX], h
+
+done_hash:
+%ifndef LINUX
+	movdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
+	movdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
+	movdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
+	movdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
+	movdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
+	movdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
+	movdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
+%endif ;; LINUX
+%ifdef SAFE_DATA
+        ;; Clear potential sensitive data stored in stack (on every OS, after the restores above)
+        pxor    xmm0, xmm0
+        movdqa  [rsp + _XFER], xmm0	; last K + W quad, derived from the message block
+%ifndef LINUX
+%assign i 0
+%rep 7
+        movdqa  [rsp + _XMM_SAVE + i*16], xmm0	; xmm6-xmm12 save slots
+%assign i (i+1)
+%endrep
+%endif ;; LINUX
+%endif ;; SAFE_DATA
+	add	rsp, STACK_size
+
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	rbp
+%ifndef LINUX
+	pop	rdi
+	pop	rsi
+%endif
+	pop	rbx
+
+	ret
+
+
+%ifdef LINUX	; emit the GNU-stack note so the object does not request an executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm
new file mode 100644
index 000000000..c95f89d8f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm
@@ -0,0 +1,33 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; SHA-384 one-block entry point: assembles the SHA-512 one-block code under its own symbol name
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; FUNC is defined before the include, so the included file's "%ifndef FUNC" default is skipped
+%define FUNC sha384_block_sse
+; Shared implementation: all code and tables come from the included file
+%include "sse/sha512_one_block_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm
new file mode 100644
index 000000000..534cfbfd8
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm
@@ -0,0 +1,480 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 2 qword lanes per XMM register
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "include/os.asm"
+
+%define	MOVDQ movdqu ;; assume buffers not aligned
+; An including file may pre-define FUNC to give the entry point another name
+%ifndef FUNC
+%define FUNC sha512_block_sse
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm from mem (MOVDQ, unaligned) and pshufb it with the mask; with PSHUFFLE_BYTE_FLIP_MASK this byte swaps each qword
+%macro COPY_XMM_AND_BSWAP 3
+	MOVDQ %1, %2
+	pshufb %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; X0..X7: 16-qword message schedule window, two qwords per register (renamed by rotate_Xs)
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+%define X4 xmm8
+%define X5 xmm9
+%define X6 xmm10
+%define X7 xmm11
+; XTMP0..XTMP3: scratch for the schedule; XFER: staging register for W+K before it is stored to [rsp + _XFER]
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XFER  xmm13
+; pshufb control register, loaded from PSHUFFLE_BYTE_FLIP_MASK
+%define BYTE_FLIP_MASK	xmm12
+; argument registers and the c/d/e working variables are assigned per ABI
+%ifdef LINUX
+%define CTX	rsi	; 2nd arg
+%define INP	rdi	; 1st arg
+
+%define SRND	rdi	; clobbers INP
+%define c	rcx
+%define d 	r8
+%define e 	rdx
+%else
+%define CTX	rdx 	; 2nd arg
+%define INP	rcx 	; 1st arg
+
+%define SRND	rcx	; clobbers INP
+%define c 	rdi
+%define d	rsi
+%define e 	r8
+
+%endif
+%define TBL	rbp
+%define a rax
+%define b rbx
+; a..h are the eight working variables; ROTATE_ARGS renames them after every round
+%define f r9
+%define g r10
+%define h r11
+; y0..y2: scratch registers used inside each round
+%define y0 r13
+%define y1 r14
+%define y2 r15
+
+; Stack frame layout; offsets are used relative to rsp after "sub rsp, STACK_size"
+struc STACK
+%ifndef LINUX
+_XMM_SAVE:	reso	8	; save area for xmm6-xmm13 (non-Linux builds only)
+%endif
+_XFER:		reso	1	; one 16-byte slot: W+K for the two rounds being processed
+endstruc
+
+; rotate_Xs
+; Rename symbols X0...X7 at assembly time: X0<-X1, ..., X6<-X7, X7<-old X0 (emits no instructions)
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X5
+%xdefine X5 X6
+%xdefine X6 X7
+%xdefine X7 X_
+%endm
+
+; ROTATE_ARGS
+; Rename symbols a...h at assembly time: h<-g, g<-f, ..., b<-a, a<-old h (emits no instructions)
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+; TWO_ROUNDS_AND_SCHED: two scalar rounds (k+w read from [rsp + _XFER]) interleaved with computing the next two W qwords into X0
+%macro TWO_ROUNDS_AND_SCHED 0
+	;; in the scalar comments ">>" denotes rotate right (ror); ends with a..h renamed twice and X0..X7 renamed once
+		;; compute s0 and s1 two at a time
+		;; compute W[-16] + W[-7] 2 at a time
+		movdqa	XTMP0, X5
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+	ror	y0, (41-18)	; y0 = e >> (41-18)
+		palignr	XTMP0, X4, 8	; XTMP0 = W[-7]
+	xor	y0, e		; y0 = e ^ (e >> (41-18))
+	mov	y2, f		; y2 = f
+	ror	y1, (39-34)	; y1 = a >> (39-34)
+	xor	y1, a		; y1 = a ^ (a >> (39-34))
+		movdqa	XTMP1, X1
+	ror	y0, (18-14)	; y0 = (e >> (18-14)) ^ (e >> (41-14))
+	xor	y2, g		; y2 = f^g
+		paddq	XTMP0, X0	; XTMP0 = W[-7] + W[-16]
+	ror	y1, (34-28)	; y1 = (a >> (34-28)) ^ (a >> (39-28))
+	xor	y0, e		; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+	and	y2, e		; y2 = (f^g)&e
+		;; compute s0
+		palignr	XTMP1, X0, 8	; XTMP1 = W[-15]
+	xor	y1, a		; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+		movdqa	XTMP2, XTMP1	; XTMP2 = W[-15]
+	ror	y0, 14		; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH
+	ror	y1, 28		; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+		movdqa	XTMP3, XTMP1	; XTMP3 = W[-15]
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+		psllq	XTMP1, (64-1)
+	mov	y2, a		; y2 = a
+	or	y0, c		; y0 = a|c
+		psrlq	XTMP2, 1
+	add	d, h		; d = d + t1
+	and	y2, c		; y2 = a&c
+		por	XTMP1, XTMP2	; XTMP1 = W[-15] ror 1
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = t1 + S0
+		movdqa	XTMP2, XTMP3	; XTMP2 = W[-15]
+		psrlq	XTMP2, 8
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = t1 + S0 + MAJ
+		movdqa	X0, XTMP3	; X0 = W[-15]
+		psllq	XTMP3, (64-8)
+
+	;; second round: same sequence after renaming a..h, with k+w read from [rsp + _XFER + 1*8]
+ROTATE_ARGS
+		pxor	XTMP1, XTMP3
+		psrlq	X0, 7		; X0 = W[-15] >> 7
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+		pxor	XTMP1, XTMP2	; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8
+	ror	y0, (41-18)	; y0 = e >> (41-18)
+	xor	y0, e		; y0 = e ^ (e >> (41-18))
+	mov	y2, f		; y2 = f
+		pxor	XTMP1, X0	; XTMP1 = s0
+	ror	y1, (39-34)	; y1 = a >> (39-34)
+	xor	y1, a		; y1 = a ^ (a >> (39-34))
+		;; compute s1
+		movdqa	XTMP2, X7	; XTMP2 = W[-2]
+	ror	y0, (18-14)	; y0 = (e >> (18-14)) ^ (e >> (41-14))
+	xor	y2, g		; y2 = f^g
+		paddq	XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
+	ror	y1, (34-28)	; y1 = (a >> (34-28)) ^ (a >> (39-28))
+	xor	y0, e		; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2]
+		movdqa	X0, XTMP2	; X0 = W[-2]
+	and	y2, e		; y2 = (f^g)&e
+	ror	y0, 14		; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+	xor	y1, a		; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+		psllq	XTMP3, (64-19)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH
+		psrlq	X0, 19
+	ror	y1, 28		; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+		por	XTMP3, X0	; XTMP3 = W[-2] ror 19
+	mov	y2, a		; y2 = a
+	or	y0, c		; y0 = a|c
+		movdqa	X0, XTMP2	; X0 = W[-2]
+		movdqa	XTMP1, XTMP2	; XTMP1 = W[-2]
+	add	d, h		; d = d + t1
+	and	y2, c		; y2 = a&c
+		psllq	X0, (64-61)
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = t1 + S0
+		psrlq	XTMP1, 61
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = t1 + S0 + MAJ
+		por	X0, XTMP1	; X0 = W[-2] ror 61
+		psrlq	XTMP2, 6	; XTMP2 = W[-2] >> 6
+		pxor	XTMP2, XTMP3
+		pxor	X0, XTMP2	; X0 = s1
+		paddq	X0, XTMP0	; X0 = {W[1], W[0]}
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; one round without message scheduling; k+w input is [rsp + _XFER + %1 * 8]; renames a..h at the end (">>" in comments = ror)
+%macro DO_ROUND 1
+	mov	y0, e		; y0 = e
+	ror	y0, (41-18)	; y0 = e >> (41-18)
+	mov	y1, a		; y1 = a
+	xor	y0, e		; y0 = e ^ (e >> (41-18))
+	ror	y1, (39-34)	; y1 = a >> (39-34)
+	mov	y2, f		; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (39-34))
+	ror	y0, (18-14)	; y0 = (e >> (18-14)) ^ (e >> (41-14))
+	xor	y2, g		; y2 = f^g
+	xor	y0, e		; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+	ror	y1, (34-28)	; y1 = (a >> (34-28)) ^ (a >> (39-28))
+	and	y2, e		; y2 = (f^g)&e
+	xor	y1, a		; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+	ror	y0, 14		; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	add	y2, y0		; y2 = S1 + CH
+	ror	y1, 28		; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+	add	y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + t1
+	and	y2, c		; y2 = a&c
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = t1 + S0
+	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
+	add	h, y0		; h = t1 + S0 + MAJ
+	ROTATE_ARGS
+%endm
+
+section .data
+default rel
+align 64
+K512:	; round constants: 80 qwords, two per row, read 16 bytes at a time through TBL
+	dq	0x428a2f98d728ae22,0x7137449123ef65cd
+	dq	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	dq	0x3956c25bf348b538,0x59f111f1b605d019
+	dq	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	dq	0xd807aa98a3030242,0x12835b0145706fbe
+	dq	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	dq	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	dq	0x9bdc06a725c71235,0xc19bf174cf692694
+	dq	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	dq	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	dq	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	dq	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	dq	0x983e5152ee66dfab,0xa831c66d2db43210
+	dq	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	dq	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	dq	0x06ca6351e003826f,0x142929670a0e6e70
+	dq	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	dq	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	dq	0x650a73548baf63de,0x766a0abb3c77b2a8
+	dq	0x81c2c92e47edaee6,0x92722c851482353b
+	dq	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	dq	0xc24b8b70d0f89791,0xc76c51a30654be30
+	dq	0xd192e819d6ef5218,0xd69906245565a910
+	dq	0xf40e35855771202a,0x106aa07032bbd1b8
+	dq	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	dq	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	dq	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	dq	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	dq	0x748f82ee5defb2fc,0x78a5636f43172f60
+	dq	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	dq	0x90befffa23631e28,0xa4506cebde82bde9
+	dq	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	dq	0xca273eceea26619c,0xd186b8c721c0c207
+	dq	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	dq	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	dq	0x113f9804bef90dae,0x1b710b35131c471b
+	dq	0x28db77f523047d84,0x32caab7b40c72493
+	dq	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	dq	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	dq	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+; pshufb control that reverses the byte order inside each qword; aligned because it is loaded with movdqa
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+	dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Loads digest[0..7], runs 80 rounds over one 128-byte block, adds the result back into digest
+;; void FUNC(void *input_data, UINT64 digest[8])
+;; arg 1 : pointer to input data (one 128-byte block, read with unaligned loads)
+;; arg 2 : pointer to digest (8 qwords, read at entry and updated in place)
+section .text
+MKGLOBAL(FUNC,function,internal)
+align 32
+FUNC:
+	push	rbx
+%ifndef LINUX
+	push	rsi
+	push	rdi
+%endif
+	push	rbp
+	push	r13
+	push	r14
+	push	r15
+
+	sub	rsp,STACK_size	; frame slots are accessed with movdqa below
+%ifndef LINUX
+	movdqa	[rsp + _XMM_SAVE + 0*16],xmm6
+	movdqa	[rsp + _XMM_SAVE + 1*16],xmm7
+	movdqa	[rsp + _XMM_SAVE + 2*16],xmm8
+	movdqa	[rsp + _XMM_SAVE + 3*16],xmm9
+	movdqa	[rsp + _XMM_SAVE + 4*16],xmm10
+	movdqa	[rsp + _XMM_SAVE + 5*16],xmm11
+	movdqa	[rsp + _XMM_SAVE + 6*16],xmm12
+	movdqa	[rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+	;; load initial digest
+	mov	a, [8*0 + CTX]
+	mov	b, [8*1 + CTX]
+	mov	c, [8*2 + CTX]
+	mov	d, [8*3 + CTX]
+	mov	e, [8*4 + CTX]
+	mov	f, [8*5 + CTX]
+	mov	g, [8*6 + CTX]
+	mov	h, [8*7 + CTX]
+
+	movdqa	BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+	lea	TBL,[rel K512]
+
+	;; byte swap first 16 qwords
+	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X4, [INP + 4*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X5, [INP + 5*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X6, [INP + 6*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X7, [INP + 7*16], BYTE_FLIP_MASK
+	;; INP is dead from here on: SRND reuses the same register
+	;; schedule 64 input qwords, by doing 4 iterations of 16 rounds
+	mov	SRND, 4
+align 16
+loop1:
+
+%assign i 0
+%rep 7
+	movdqa	XFER, X0
+	paddq	XFER, [TBL + i*16]
+	movdqa	[rsp + _XFER], XFER
+	TWO_ROUNDS_AND_SCHED
+%assign i (i+1)
+%endrep
+
+	movdqa	XFER, X0
+	paddq	XFER, [TBL + 7*16]
+	movdqa	[rsp + _XFER], XFER
+	add	TBL, 8*16
+	TWO_ROUNDS_AND_SCHED
+
+	sub	SRND, 1
+	jne	loop1
+	;; last 16 rounds need no scheduling: one pass of 8 rounds over X0..X3, then one over X4..X7
+	mov	SRND, 2
+	jmp loop2a
+loop2:
+	movdqa	X0, X4
+	movdqa	X1, X5
+	movdqa	X2, X6
+	movdqa	X3, X7
+
+loop2a:
+	paddq	X0, [TBL + 0*16]
+	movdqa	[rsp + _XFER], X0
+	DO_ROUND 0
+	DO_ROUND 1
+
+	paddq	X1, [TBL + 1*16]
+	movdqa	[rsp + _XFER], X1
+	DO_ROUND 0
+	DO_ROUND 1
+
+	paddq	X2, [TBL + 2*16]
+	movdqa	[rsp + _XFER], X2
+	DO_ROUND 0
+	DO_ROUND 1
+
+	paddq	X3, [TBL + 3*16]
+	movdqa	[rsp + _XFER], X3
+	add	TBL, 4*16
+	DO_ROUND 0
+	DO_ROUND 1
+
+	sub	SRND, 1
+	jne	loop2
+	;; add the working variables back into the digest
+	add	[8*0 + CTX], a
+	add	[8*1 + CTX], b
+	add	[8*2 + CTX], c
+	add	[8*3 + CTX], d
+	add	[8*4 + CTX], e
+	add	[8*5 + CTX], f
+	add	[8*6 + CTX], g
+	add	[8*7 + CTX], h
+
+done_hash:
+%ifndef LINUX
+	movdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
+	movdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
+	movdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
+	movdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
+	movdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
+	movdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
+	movdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
+	movdqa	xmm13,[rsp + _XMM_SAVE + 7*16]
+%endif ;; LINUX
+%ifdef SAFE_DATA
+        ;; Clear potential sensitive data stored in stack: the last W+K pair
+        ;; in _XFER (all builds) and the XMM save area (non-Linux builds)
+        pxor    xmm0, xmm0
+        movdqa  [rsp + _XFER], xmm0
+%ifndef LINUX
+%assign i 0
+%rep 8
+        movdqa  [rsp + _XMM_SAVE + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif ;; LINUX
+%endif ;; SAFE_DATA
+
+	add	rsp, STACK_size
+
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	rbp
+%ifndef LINUX
+	pop	rdi
+	pop	rsi
+%endif
+	pop	rbx
+
+	ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm
new file mode 100644
index 000000000..77043f29f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm
@@ -0,0 +1,449 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute SHA512 by-2 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: xmm0-15 and the general-purpose registers listed per OS below
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers:  rax         rdx             r8 r9 r10 r11
+;; Windows preserves:     rbx rcx     rsi rdi rbp               r12 r13 r14 r15
+;;
+;; Linux clobbers:    rax             rsi         r8 r9 r10 r11
+;; Linux preserves:       rbx rcx rdx     rdi rbp               r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+section .data
+default rel
+align 64
+MKGLOBAL(K512_2,data,internal)
+K512_2:	; 80 rows of 16 bytes: each round constant duplicated into both qword lanes
+	dq	0x428a2f98d728ae22, 0x428a2f98d728ae22
+	dq	0x7137449123ef65cd, 0x7137449123ef65cd
+	dq	0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+	dq	0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+	dq	0x3956c25bf348b538, 0x3956c25bf348b538
+	dq	0x59f111f1b605d019, 0x59f111f1b605d019
+	dq	0x923f82a4af194f9b, 0x923f82a4af194f9b
+	dq	0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+	dq	0xd807aa98a3030242, 0xd807aa98a3030242
+	dq	0x12835b0145706fbe, 0x12835b0145706fbe
+	dq	0x243185be4ee4b28c, 0x243185be4ee4b28c
+	dq	0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+	dq	0x72be5d74f27b896f, 0x72be5d74f27b896f
+	dq	0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+	dq	0x9bdc06a725c71235, 0x9bdc06a725c71235
+	dq	0xc19bf174cf692694, 0xc19bf174cf692694
+	dq	0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+	dq	0xefbe4786384f25e3, 0xefbe4786384f25e3
+	dq	0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+	dq	0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+	dq	0x2de92c6f592b0275, 0x2de92c6f592b0275
+	dq	0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+	dq	0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+	dq	0x76f988da831153b5, 0x76f988da831153b5
+	dq	0x983e5152ee66dfab, 0x983e5152ee66dfab
+	dq	0xa831c66d2db43210, 0xa831c66d2db43210
+	dq	0xb00327c898fb213f, 0xb00327c898fb213f
+	dq	0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+	dq	0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+	dq	0xd5a79147930aa725, 0xd5a79147930aa725
+	dq	0x06ca6351e003826f, 0x06ca6351e003826f
+	dq	0x142929670a0e6e70, 0x142929670a0e6e70
+	dq	0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+	dq	0x2e1b21385c26c926, 0x2e1b21385c26c926
+	dq	0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+	dq	0x53380d139d95b3df, 0x53380d139d95b3df
+	dq	0x650a73548baf63de, 0x650a73548baf63de
+	dq	0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+	dq	0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+	dq	0x92722c851482353b, 0x92722c851482353b
+	dq	0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+	dq	0xa81a664bbc423001, 0xa81a664bbc423001
+	dq	0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+	dq	0xc76c51a30654be30, 0xc76c51a30654be30
+	dq	0xd192e819d6ef5218, 0xd192e819d6ef5218
+	dq	0xd69906245565a910, 0xd69906245565a910
+	dq	0xf40e35855771202a, 0xf40e35855771202a
+	dq	0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+	dq	0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+	dq	0x1e376c085141ab53, 0x1e376c085141ab53
+	dq	0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+	dq	0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+	dq	0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+	dq	0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+	dq	0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+	dq	0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+	dq	0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+	dq	0x78a5636f43172f60, 0x78a5636f43172f60
+	dq	0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+	dq	0x8cc702081a6439ec, 0x8cc702081a6439ec
+	dq	0x90befffa23631e28, 0x90befffa23631e28
+	dq	0xa4506cebde82bde9, 0xa4506cebde82bde9
+	dq	0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+	dq	0xc67178f2e372532b, 0xc67178f2e372532b
+	dq	0xca273eceea26619c, 0xca273eceea26619c
+	dq	0xd186b8c721c0c207, 0xd186b8c721c0c207
+	dq	0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+	dq	0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+	dq	0x06f067aa72176fba, 0x06f067aa72176fba
+	dq	0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+	dq	0x113f9804bef90dae, 0x113f9804bef90dae
+	dq	0x1b710b35131c471b, 0x1b710b35131c471b
+	dq	0x28db77f523047d84, 0x28db77f523047d84
+	dq	0x32caab7b40c72493, 0x32caab7b40c72493
+	dq	0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+	dq	0x431d67c49c100d4c, 0x431d67c49c100d4c
+	dq	0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+	dq	0x597f299cfc657e2a, 0x597f299cfc657e2a
+	dq	0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+	dq	0x6c44198c4a475817, 0x6c44198c4a475817
+align 16	; loaded with movdqa: make the 16-byte alignment explicit instead of relying on the size of the table above
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq	0x08090a0b0c0d0e0f0001020304050607
+	dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifdef LINUX ; Linux definitions
+ %define arg1    rdi
+ %define arg2    rsi
+%else ; Windows definitions
+ %define arg1    rcx
+ %define arg2    rdx
+%endif
+
+; Common definitions
+%define STATE    arg1
+%define INP_SIZE arg2
+; IDX: byte offset into both input buffers; ROUND: byte offset into K512_2; TBL: address of K512_2
+%define IDX     rax
+%define ROUND	r8
+%define TBL	r11
+; data pointers of lane 0 and lane 1, loaded from the args structure
+%define inp0 r9
+%define inp1 r10
+; a..h: working variables, one qword per lane; renamed after every round by ROTATE_ARGS
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+; a0..a2: scratch registers used inside each round
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+; TTn: temporaries for loading/transposing input; note TT4/TT5 alias a2/a1 and TT0 aliases T1
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+; T1: W value handed to the round macros; TMP: scratch for PRORQ, also used for the byte-flip mask
+%define T1  xmm14
+%define TMP xmm15
+
+
+
+%define SZ2	2*SHA512_DIGEST_WORD_SIZE	; Size of one vector register
+%define ROUNDS 80*SZ2	; value of ROUND after 80 rounds (ROUND advances by SZ2 per round)
+
+; Define stack usage
+; (offsets are used relative to rsp after "sub rsp, STACK_size")
+struc STACK
+_DATA:		resb	SZ2 * 16	; ring of the 16 most recent W values (both lanes), indexed by round & 0xf
+_DIGEST:	resb	SZ2 * NUM_SHA512_DIGEST_WORDS	; copy of a..h taken at the start of each block
+		resb	8 	; for alignment, must be odd multiple of 8 (frame is accessed with movdqa)
+endstruc
+; unaligned 16-byte load used for the input message
+%define MOVPD	movupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+; (r1 is only read, never written)
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+	movapd  %%t0, %%r0		; t0 = a1 a0
+	shufpd	%%r0, %%r1, 00b		; r0 = b0 a0
+	shufpd	%%t0, %%r1, 11b		; t0 = b1 a1
+%endm
+
+; Rename symbols a...h at assembly time: h<-g, g<-f, ..., b<-a, a<-old h (emits no instructions)
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-quadword: rotates each 64-bit lane of reg right by imm
+; does a rotate by doing two shifts and an or; tmp is overwritten
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+	movdqa	%%tmp, %%reg
+	psllq  	%%tmp, (64-(%%imm))
+	psrlq	%%reg, %%imm
+	por		%%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt  (two-operand form: TMP is used as the scratch register)
+%macro PRORQ 2
+	PRORQ	%1, %2, TMP
+%endmacro
+
+
+;; ROUND_00_15 T1, i: one round on both lanes; T1 = W[i] on entry (clobbered). a...h, TBL and ROUND are implicit (">>" in comments = rotate right)
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i  %2
+	movdqa	a0, e		; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+	PRORQ	a0, (18-14)	; sig1: a0 = (e >> 4)
+
+	movdqa	a2, f		; ch: a2 = f
+	pxor	a2, g		; ch: a2 = f^g
+	pand	a2, e		; ch: a2 = (f^g)&e
+	pxor	a2, g		; a2 = ch
+
+	PRORQ	a1, 41		; sig1: a1 = (e >> 41)
+        movdqa	[SZ2*(%%i&0xf) + rsp],%%T1
+	paddq	%%T1,[TBL + ROUND]	; T1 = W + K
+	pxor	a0, e		; sig1: a0 = e ^ (e >> 4)
+	PRORQ	a0, 14		; sig1: a0 = (e >> 14) ^ (e >> 18)
+	paddq	h, a2		; h = h + ch
+	movdqa	a2, a		; sig0: a2 = a
+	PRORQ	a2, (34-28)	; sig0: a2 = (a >> 6)
+	paddq	h, %%T1		; h = h + ch + W + K
+	pxor	a0, a1		; a0 = sigma1
+	movdqa	a1, a		; sig0: a1 = a
+	movdqa	%%T1, a		; maj: T1 = a
+	PRORQ	a1, 39		; sig0: a1 = (a >> 39)
+	pxor	%%T1, c		; maj: T1 = a^c
+	add	ROUND, SZ2	; ROUND += SZ2 (next row of the K table)
+	pand	%%T1, b		; maj: T1 = (a^c)&b
+	paddq	h, a0
+
+	paddq	d, h
+
+	pxor	a2, a		; sig0: a2 = a ^ (a >> 6)
+	PRORQ	a2, 28		; sig0: a2 = (a >> 28) ^ (a >> 34)
+	pxor	a2, a1		; a2 = sig0
+	movdqa	a1, a		; maj: a1 = a
+	pand	a1, c		; maj: a1 = a&c
+	por	a1, %%T1	; a1 = maj
+	paddq	h, a1		; h = h + ch + W + K + sigma1 + maj
+	paddq	h, a2		; h = h + ch + W + K + sigma1 + maj + sigma0
+
+	ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i: computes W[i] from the 16-entry ring on the stack into T1, then runs ROUND_00_15 T1, i
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i  %2
+	movdqa	%%T1, [SZ2*((%%i-15)&0xf) + rsp]	; T1 = W[i-15]
+	movdqa	a1, [SZ2*((%%i-2)&0xf) + rsp]	; a1 = W[i-2]
+	movdqa	a0, %%T1	; a0 = W[i-15]
+	PRORQ	%%T1, 8-1	; T1 = W[i-15] ror 7
+	movdqa	a2, a1		; a2 = W[i-2]
+	PRORQ	a1, 61-19	; a1 = W[i-2] ror 42
+	pxor	%%T1, a0	; T1 = W[i-15] ^ (W[i-15] ror 7)
+	PRORQ	%%T1, 1		; T1 = (W[i-15] ror 1) ^ (W[i-15] ror 8)
+	pxor	a1, a2		; a1 = W[i-2] ^ (W[i-2] ror 42)
+	PRORQ	a1, 19		; a1 = (W[i-2] ror 19) ^ (W[i-2] ror 61)
+	psrlq	a0, 7		; a0 = W[i-15] shifted right by 7
+	pxor	%%T1, a0	; T1 = s0
+	psrlq	a2, 6		; a2 = W[i-2] shifted right by 6
+	pxor	a1, a2		; a1 = s1
+	paddq	%%T1, [SZ2*((%%i-16)&0xf) + rsp]	; T1 = s0 + W[i-16]
+	paddq	a1, [SZ2*((%%i-7)&0xf) + rsp]	; a1 = s1 + W[i-7]
+	paddq	%%T1, a1	; T1 = W[i]
+
+	ROUND_00_15 %%T1, %%i
+%endm
+
+
+
+;; SHA512_ARGS:
+;;   UINT128 digest[8];  // transposed digests
+;;   UINT8  *data_ptr[2];
+;;
+
+;; void sha512_x2_sse(SHA512_ARGS *args, UINT64 num_blocks);
+;; arg 1 : STATE    : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;; On return the digest rows and both data pointers in args have been updated
+MKGLOBAL(sha512_x2_sse,function,internal)
+align 32
+sha512_x2_sse:
+	; general registers preserved in outer calling routine
+	; outer calling routine saves all the XMM registers
+	sub	rsp, STACK_size
+
+	;; Load the pre-transposed incoming digest.
+	movdqa	a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+	movdqa	b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+	movdqa	c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+	movdqa	d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+	movdqa	e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+	movdqa	f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+	movdqa	g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+	movdqa	h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+	DBGPRINTL_XMM "incoming transposed sha512 digest", a, b, c, d, e, f, g, h
+	lea	TBL,[rel K512_2]
+
+	;; load the address of each of the 2 message lanes
+	;; getting ready to transpose input onto stack
+	mov	inp0,[STATE + _data_ptr_sha512  +0*PTR_SZ]
+	mov	inp1,[STATE + _data_ptr_sha512  +1*PTR_SZ]
+
+	xor	IDX, IDX
+lloop:
+	xor	ROUND, ROUND
+	DBGPRINTL64  "lloop enter INP_SIZE ", INP_SIZE
+	DBGPRINTL64 " IDX = ", IDX
+	;; save old digest
+	movdqa	[rsp + _DIGEST + 0*SZ2], a
+	movdqa	[rsp + _DIGEST + 1*SZ2], b
+	movdqa	[rsp + _DIGEST + 2*SZ2], c
+	movdqa	[rsp + _DIGEST + 3*SZ2], d
+	movdqa	[rsp + _DIGEST + 4*SZ2], e
+	movdqa	[rsp + _DIGEST + 5*SZ2], f
+	movdqa	[rsp + _DIGEST + 6*SZ2], g
+	movdqa	[rsp + _DIGEST + 7*SZ2], h
+
+	DBGPRINTL "incoming data["
+%assign i 0
+%rep 8
+	;; load up the shuffler for little-endian to big-endian format
+	movdqa	TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+	MOVPD	TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+	MOVPD	TT2,[inp1+IDX+i*16]
+	DBGPRINTL_XMM "input message block", TT0
+	TRANSPOSE	TT0, TT2, TT1
+	pshufb	TT0, TMP
+	pshufb	TT1, TMP
+	ROUND_00_15	TT0,(i*2+0)
+	ROUND_00_15	TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+	DBGPRINTL "]"
+	add	IDX, 8 * 16 ;; increment by a message block
+
+;; i is 8 here and two rounds were done per iteration, so the next round index is i*2 = 16
+%assign i (i*2)
+
+	jmp	Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+	ROUND_16_XX	T1, i
+%assign i (i+1)
+%endrep
+	;; the 16 unrolled rounds are re-run (ring offsets repeat modulo 16) until ROUND reaches ROUNDS
+	cmp	ROUND,ROUNDS
+	jb	Lrounds_16_xx
+
+	;; add old digest
+	paddq	a, [rsp + _DIGEST + 0*SZ2]
+	paddq	b, [rsp + _DIGEST + 1*SZ2]
+	paddq	c, [rsp + _DIGEST + 2*SZ2]
+	paddq	d, [rsp + _DIGEST + 3*SZ2]
+	paddq	e, [rsp + _DIGEST + 4*SZ2]
+	paddq	f, [rsp + _DIGEST + 5*SZ2]
+	paddq	g, [rsp + _DIGEST + 6*SZ2]
+	paddq	h, [rsp + _DIGEST + 7*SZ2]
+
+	sub	INP_SIZE, 1  ;; unit is blocks
+	jne	lloop
+
+	; write back to memory (state object) the transposed digest
+	movdqa	[STATE + 0*SHA512_DIGEST_ROW_SIZE],a
+	movdqa	[STATE + 1*SHA512_DIGEST_ROW_SIZE],b
+	movdqa	[STATE + 2*SHA512_DIGEST_ROW_SIZE],c
+	movdqa	[STATE + 3*SHA512_DIGEST_ROW_SIZE],d
+	movdqa	[STATE + 4*SHA512_DIGEST_ROW_SIZE],e
+	movdqa	[STATE + 5*SHA512_DIGEST_ROW_SIZE],f
+	movdqa	[STATE + 6*SHA512_DIGEST_ROW_SIZE],g
+	movdqa	[STATE + 7*SHA512_DIGEST_ROW_SIZE],h
+	DBGPRINTL_XMM "exit transposed digest ", a, b, c, d, e, f, g, h
+
+	; update input pointers
+	add	inp0, IDX
+	mov	[STATE + _data_ptr_sha512  + 0*PTR_SZ], inp0
+	add	inp1, IDX
+	mov	[STATE + _data_ptr_sha512  + 1*PTR_SZ], inp1
+
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+
+        ;; Clear stack frame ((16 + 8)*16 bytes)
+%ifdef SAFE_DATA
+        pxor    xmm0, xmm0
+%assign i 0
+%rep (16+NUM_SHA512_DIGEST_WORDS)
+        movdqa	[rsp + i*SZ2], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+	add	rsp, STACK_size
+DBGPRINTL "====================== exit sha512_x2_sse code =====================\n"
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm
new file mode 100644
index 000000000..954d6597e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm
@@ -0,0 +1,457 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute quad SHA256 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers:  rax rbx     rdx             r8 r9 r10 r11 r12
+;; Windows preserves:         rcx     rsi rdi rbp                   r13 r14 r15
+;;
+;; Linux clobbers:    rax rbx         rsi         r8 r9 r10 r11 r12
+;; Linux preserves:           rcx rdx     rdi rbp                   r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+section .data
+default rel
+align 64
+MKGLOBAL(K256_4,data,internal)
+K256_4:
+	dq	0x428a2f98428a2f98, 0x428a2f98428a2f98
+	dq	0x7137449171374491, 0x7137449171374491
+	dq	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+	dq	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+	dq	0x3956c25b3956c25b, 0x3956c25b3956c25b
+	dq	0x59f111f159f111f1, 0x59f111f159f111f1
+	dq	0x923f82a4923f82a4, 0x923f82a4923f82a4
+	dq	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+	dq	0xd807aa98d807aa98, 0xd807aa98d807aa98
+	dq	0x12835b0112835b01, 0x12835b0112835b01
+	dq	0x243185be243185be, 0x243185be243185be
+	dq	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+	dq	0x72be5d7472be5d74, 0x72be5d7472be5d74
+	dq	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+	dq	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+	dq	0xc19bf174c19bf174, 0xc19bf174c19bf174
+	dq	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+	dq	0xefbe4786efbe4786, 0xefbe4786efbe4786
+	dq	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+	dq	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+	dq	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+	dq	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+	dq	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+	dq	0x76f988da76f988da, 0x76f988da76f988da
+	dq	0x983e5152983e5152, 0x983e5152983e5152
+	dq	0xa831c66da831c66d, 0xa831c66da831c66d
+	dq	0xb00327c8b00327c8, 0xb00327c8b00327c8
+	dq	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+	dq	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+	dq	0xd5a79147d5a79147, 0xd5a79147d5a79147
+	dq	0x06ca635106ca6351, 0x06ca635106ca6351
+	dq	0x1429296714292967, 0x1429296714292967
+	dq	0x27b70a8527b70a85, 0x27b70a8527b70a85
+	dq	0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+	dq	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+	dq	0x53380d1353380d13, 0x53380d1353380d13
+	dq	0x650a7354650a7354, 0x650a7354650a7354
+	dq	0x766a0abb766a0abb, 0x766a0abb766a0abb
+	dq	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+	dq	0x92722c8592722c85, 0x92722c8592722c85
+	dq	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+	dq	0xa81a664ba81a664b, 0xa81a664ba81a664b
+	dq	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+	dq	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+	dq	0xd192e819d192e819, 0xd192e819d192e819
+	dq	0xd6990624d6990624, 0xd6990624d6990624
+	dq	0xf40e3585f40e3585, 0xf40e3585f40e3585
+	dq	0x106aa070106aa070, 0x106aa070106aa070
+	dq	0x19a4c11619a4c116, 0x19a4c11619a4c116
+	dq	0x1e376c081e376c08, 0x1e376c081e376c08
+	dq	0x2748774c2748774c, 0x2748774c2748774c
+	dq	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+	dq	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+	dq	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+	dq	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+	dq	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+	dq	0x748f82ee748f82ee, 0x748f82ee748f82ee
+	dq	0x78a5636f78a5636f, 0x78a5636f78a5636f
+	dq	0x84c8781484c87814, 0x84c8781484c87814
+	dq	0x8cc702088cc70208, 0x8cc702088cc70208
+	dq	0x90befffa90befffa, 0x90befffa90befffa
+	dq	0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+	dq	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+	dq	0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifdef LINUX ; Linux definitions
+ %define arg1 	rdi
+ %define arg2	rsi
+%else ; Windows definitions
+ %define arg1 	rcx
+ %define arg2 	rdx
+%endif
+
+; Common definitions
+%define STATE    arg1
+%define INP_SIZE arg2
+
+%define IDX     rax
+%define ROUND	rbx
+%define TBL	r12
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1  xmm14
+%define TMP xmm15
+
+%define SZ4	4*SHA256_DIGEST_WORD_SIZE	; Size of one vector register
+%define ROUNDS 64*SZ4	; ROUND advances by SZ4 per round; the round loop stops at 64 rounds
+
+; Stack frame layout (offsets from rsp after "sub rsp, STACK_size")
+struc STACK
+_DATA:		resb	SZ4 * 16	; 16-entry ring of message-schedule vectors, slot = i & 0xf
+_DIGEST:	resb	SZ4 * NUM_SHA256_DIGEST_WORDS	; digest saved at the start of each block
+		resb	8 	; for alignment, must be odd multiple of 8
+endstruc
+
+%define MOVPS	movups
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0, t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+	movaps	%%t0, %%r0		; t0 = {a3 a2 a1 a0}
+	shufps	%%t0, %%r1, 0x44	; t0 = {b1 b0 a1 a0}
+	shufps	%%r0, %%r1, 0xEE	; r0 = {b3 b2 a3 a2}
+
+	movaps	%%t1, %%r2		; t1 = {c3 c2 c1 c0}
+	shufps	%%t1, %%r3, 0x44	; t1 = {d1 d0 c1 c0}
+	shufps	%%r2, %%r3, 0xEE	; r2 = {d3 d2 c3 c2}
+
+	movaps	%%r1, %%t0		; r1 = {b1 b0 a1 a0}
+	shufps	%%r1, %%t1, 0xDD	; r1 = {d1 c1 b1 a1}
+
+	movaps	%%r3, %%r0		; r3 = {b3 b2 a3 a2}
+	shufps	%%r3, %%r2, 0xDD	; r3 = {d3 c3 b3 a3}
+
+	shufps	%%r0, %%r2, 0x88	; r0 = {d2 c2 b2 a2}
+	shufps	%%t0, %%t1, 0x88	; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+; ROTATE_ARGS: rename the symbols a..h by one position (new a = old h, new b = old a, ...); emits no code
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp : rotate each 32-bit lane of reg right by imm bits; tmp is clobbered
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+	movdqa	%%tmp, %%reg
+	psrld	%%reg, %%imm		; reg = x >> imm
+	pslld	%%tmp, (32-(%%imm))	; tmp = x << (32 - imm)
+	por	%%reg, %%tmp		; reg = x rotated right by imm
+%endmacro
+; PRORD reg, imm : same rotate, with TMP as the scratch register
+%macro PRORD 2
+	PRORD	%1, %2, TMP
+%endmacro
+
+;; ROUND_00_15 T1, i : one round on W held in T1; a..h are implicit preprocessor symbols (">>" below means rotate right)
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i  %2
+	movdqa	a0, e		; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+	PRORD	a0, (11-6)	; sig1: a0 = (e >> 5)
+
+	movdqa	a2, f		; ch: a2 = f
+	pxor	a2, g		; ch: a2 = f^g
+	pand	a2, e		; ch: a2 = (f^g)&e
+	pxor	a2, g		; a2 = ch
+
+	PRORD	a1, 25		; sig1: a1 = (e >> 25)
+	movdqa	[SZ4*(%%i&0xf) + rsp],%%T1	; keep W in ring slot (i & 0xf) for ROUND_16_XX
+	paddd	%%T1,[TBL + ROUND]	; T1 = W + K
+	pxor	a0, e		; sig1: a0 = e ^ (e >> 5)
+	PRORD	a0, 6		; sig1: a0 = (e >> 6) ^ (e >> 11)
+	paddd	h, a2		; h = h + ch
+	movdqa	a2, a		; sig0: a2 = a
+	PRORD	a2, (13-2)	; sig0: a2 = (a >> 11)
+	paddd	h, %%T1		; h = h + ch + W + K
+	pxor	a0, a1		; a0 = sigma1
+	movdqa	a1, a		; sig0: a1 = a
+	movdqa	%%T1, a		; maj: T1 = a
+	PRORD	a1, 22		; sig0: a1 = (a >> 22)
+	pxor	%%T1, c		; maj: T1 = a^c
+	add	ROUND, SZ4	; ROUND += SZ4 (byte offset of the next K256_4 entry)
+	pand	%%T1, b		; maj: T1 = (a^c)&b
+	paddd	h, a0		; h = h + ch + W + K + sigma1
+
+	paddd	d, h		; d = d + h + ch + W + K + sigma1
+
+	pxor	a2, a		; sig0: a2 = a ^ (a >> 11)
+	PRORD	a2, 2		; sig0: a2 = (a >> 2) ^ (a >> 13)
+	pxor	a2, a1		; a2 = sig0
+	movdqa	a1, a		; maj: a1 = a
+	pand	a1, c		; maj: a1 = a&c
+	por	a1, %%T1	; a1 = maj
+	paddd	h, a1		; h = h + ch + W + K + sigma1 + maj
+	paddd	h, a2		; h = h + ch + W + K + sigma1 + maj + sigma0
+
+	ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i : build W[i] from the 16-entry ring on the stack, then run ROUND_00_15 (a..h implicit)
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i  %2
+	movdqa	%%T1, [SZ4*((%%i-15)&0xf) + rsp]	; T1 = W[i-15]
+	movdqa	a1, [SZ4*((%%i-2)&0xf) + rsp]	; a1 = W[i-2]
+	movdqa	a0, %%T1
+	PRORD	%%T1, 18-7
+	movdqa	a2, a1
+	PRORD	a1, 19-17
+	pxor	%%T1, a0
+	PRORD	%%T1, 7		; T1 = (W[i-15] ror 7) ^ (W[i-15] ror 18)
+	pxor	a1, a2
+	PRORD	a1, 17		; a1 = (W[i-2] ror 17) ^ (W[i-2] ror 19)
+	psrld	a0, 3
+	pxor	%%T1, a0	; T1 ^= W[i-15] >> 3  (logical shift)
+	psrld	a2, 10
+	pxor	a1, a2		; a1 ^= W[i-2] >> 10  (logical shift)
+	paddd	%%T1, [SZ4*((%%i-16)&0xf) + rsp]	; T1 += W[i-16]
+	paddd	a1, [SZ4*((%%i-7)&0xf) + rsp]	; a1 += W[i-7]
+	paddd	%%T1, a1	; T1 = W[i]
+
+	ROUND_00_15 %%T1, %%i
+%endm
+
+
+
+;; SHA256_ARGS:
+;;   UINT128 digest[8];  // transposed digests
+;;   UINT8  *data_ptr[4];
+;;
+
+;; void sha_256_mult_sse(SHA256_ARGS *args, UINT64 num_blocks);
+;; arg 1 : STATE    : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;; On return the 8 digest rows and the 4 data pointers in *args have been updated.
+MKGLOBAL(sha_256_mult_sse,function,internal)
+align 32
+sha_256_mult_sse:
+	; general registers preserved in outer calling routine
+	; outer calling routine saves all the XMM registers
+	sub	rsp, STACK_size
+
+	;; Load the pre-transposed incoming digest.
+	movdqa	a,[STATE + 0 * SHA256_DIGEST_ROW_SIZE ]
+	movdqa	b,[STATE + 1 * SHA256_DIGEST_ROW_SIZE ]
+	movdqa	c,[STATE + 2 * SHA256_DIGEST_ROW_SIZE ]
+	movdqa	d,[STATE + 3 * SHA256_DIGEST_ROW_SIZE ]
+	movdqa	e,[STATE + 4 * SHA256_DIGEST_ROW_SIZE ]
+	movdqa	f,[STATE + 5 * SHA256_DIGEST_ROW_SIZE ]
+	movdqa	g,[STATE + 6 * SHA256_DIGEST_ROW_SIZE ]
+	movdqa	h,[STATE + 7 * SHA256_DIGEST_ROW_SIZE ]
+
+        DBGPRINTL_XMM "incoming transposed sha256 digest", a, b, c, d, e, f, g, h
+	lea	TBL,[rel K256_4]
+
+	;; load the address of each of the 4 message lanes
+	;; getting ready to transpose input onto stack
+	mov	inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ]
+	mov	inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ]
+	mov	inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ]
+	mov	inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ]
+        DBGPRINTL64 "incoming input data ptrs ", inp0, inp1, inp2, inp3
+	xor	IDX, IDX
+lloop:
+	xor	ROUND, ROUND
+
+	;; save old digest
+	movdqa	[rsp + _DIGEST + 0*SZ4], a
+	movdqa	[rsp + _DIGEST + 1*SZ4], b
+	movdqa	[rsp + _DIGEST + 2*SZ4], c
+	movdqa	[rsp + _DIGEST + 3*SZ4], d
+	movdqa	[rsp + _DIGEST + 4*SZ4], e
+	movdqa	[rsp + _DIGEST + 5*SZ4], f
+	movdqa	[rsp + _DIGEST + 6*SZ4], g
+	movdqa	[rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+	movdqa	TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+	MOVPS	TT2,[inp0+IDX+i*16]
+	MOVPS	TT1,[inp1+IDX+i*16]
+	MOVPS	TT4,[inp2+IDX+i*16]
+	MOVPS	TT3,[inp3+IDX+i*16]
+	TRANSPOSE	TT2, TT1, TT4, TT3, TT0, TT5
+	pshufb	TT0, TMP
+	pshufb	TT1, TMP
+	pshufb	TT2, TMP
+	pshufb	TT3, TMP
+	ROUND_00_15	TT0,(i*4+0)
+	ROUND_00_15	TT1,(i*4+1)
+	ROUND_00_15	TT2,(i*4+2)
+	ROUND_00_15	TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+	add	IDX, 4*4*4	; 64 bytes consumed from every lane
+
+	;; i is 4 after the %rep above; scale it to 16, the first round ROUND_16_XX handles
+%assign i (i*4)
+
+	jmp	Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+	ROUND_16_XX	T1, i
+%assign i (i+1)
+%endrep
+
+	cmp	ROUND,ROUNDS
+	jb	Lrounds_16_xx
+
+	;; add old digest
+	paddd	a, [rsp + _DIGEST + 0*SZ4]
+	paddd	b, [rsp + _DIGEST + 1*SZ4]
+	paddd	c, [rsp + _DIGEST + 2*SZ4]
+	paddd	d, [rsp + _DIGEST + 3*SZ4]
+	paddd	e, [rsp + _DIGEST + 4*SZ4]
+	paddd	f, [rsp + _DIGEST + 5*SZ4]
+	paddd	g, [rsp + _DIGEST + 6*SZ4]
+	paddd	h, [rsp + _DIGEST + 7*SZ4]
+
+	sub	INP_SIZE, 1  ;; unit is blocks
+	jne	lloop
+
+	; write back to memory (state object) the transposed digest
+	movdqa	[STATE+0*SHA256_DIGEST_ROW_SIZE ],a
+	movdqa	[STATE+1*SHA256_DIGEST_ROW_SIZE ],b
+	movdqa	[STATE+2*SHA256_DIGEST_ROW_SIZE ],c
+	movdqa	[STATE+3*SHA256_DIGEST_ROW_SIZE ],d
+	movdqa	[STATE+4*SHA256_DIGEST_ROW_SIZE ],e
+	movdqa	[STATE+5*SHA256_DIGEST_ROW_SIZE ],f
+	movdqa	[STATE+6*SHA256_DIGEST_ROW_SIZE ],g
+	movdqa	[STATE+7*SHA256_DIGEST_ROW_SIZE ],h
+	DBGPRINTL_XMM "updated transposed sha256 digest", a, b, c, d, e, f, g, h
+
+	; update input pointers (same PTR_SZ stride as the loads at function entry)
+	add	inp0, IDX
+	mov	[STATE + _data_ptr_sha256 + 0*PTR_SZ], inp0
+	add	inp1, IDX
+	mov	[STATE + _data_ptr_sha256 + 1*PTR_SZ], inp1
+	add	inp2, IDX
+	mov	[STATE + _data_ptr_sha256 + 2*PTR_SZ], inp2
+	add	inp3, IDX
+	mov	[STATE + _data_ptr_sha256 + 3*PTR_SZ], inp3
+
+        DBGPRINTL64 "updated input data ptrs ", inp0, inp1, inp2, inp3
+
+	;;;;;;;;;;;;;;;;
+	;; Postamble
+
+%ifdef SAFE_DATA
+        ;; Clear stack frame ((16 + 8)*16 bytes)
+        pxor    xmm0, xmm0
+%assign i 0
+%rep (16+NUM_SHA256_DIGEST_WORDS)
+        movdqa	[rsp + i*SZ4], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+	add	rsp, STACK_size
+	; outer calling routine restores XMM and other GP registers
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/snow3g_sse.c b/src/spdk/intel-ipsec-mb/sse/snow3g_sse.c
new file mode 100644
index 000000000..aadd85633
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/snow3g_sse.c
@@ -0,0 +1,42 @@
+/*******************************************************************************
+  Copyright (c) 2019, Intel Corporation
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+      * Neither the name of Intel Corporation nor the names of its contributors
+        may be used to endorse or promote products derived from this software
+        without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+/* SSE build: bind each generic SNOW3G_* name to an _sse symbol, then include the common header (presumably it defines them - confirm). */
+#define SSE
+#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_sse
+#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_sse
+#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_sse
+#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_sse
+#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_sse
+#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_sse
+#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_sse
+#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_sse
+#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_sse
+#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_sse
+#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_sse
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse
+
+#include "include/snow3g_common.h"
diff --git a/src/spdk/intel-ipsec-mb/sse/zuc_sse.asm b/src/spdk/intel-ipsec-mb/sse/zuc_sse.asm
new file mode 100755
index 000000000..0f4e490f9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/zuc_sse.asm
@@ -0,0 +1,1152 @@
+;;
+;; Copyright (c) 2009-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+
+extern lookup_8bit_sse
+
+section .data
+default rel
+align 64
+S0:
+db	0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb
+db	0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90
+db	0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac
+db	0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38
+db	0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b
+db	0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c
+db	0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad
+db	0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8
+db	0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56
+db	0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe
+db	0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d
+db	0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23
+db	0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1
+db	0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f
+db	0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65
+db	0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60
+
+S1:
+db	0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77
+db	0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42
+db	0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1
+db	0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48
+db	0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87
+db	0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb
+db	0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09
+db	0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9
+db	0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9
+db	0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89
+db	0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4
+db	0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde
+db	0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21
+db	0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34
+db	0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28
+db	0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2
+
+EK_d:	; 16 words, one per LFSR cell; make_u31 inserts the low 15 bits of each
+dw	0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF
+dw	0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC
+
+mask31:
+dd	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
+
+align 16
+bit_reverse_table_l:
+db	0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f
+
+align 16
+bit_reverse_table_h:
+db	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
+
+align 16
+bit_reverse_and_table:
+db	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
+
+align 16
+data_mask_64bits:
+dd	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+
+bit_mask_table:
+db	0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
+
+
+section .text
+
+%define MASK31  xmm12           ; and-mask register used by add_mod31 / rot_mod31 (loaded by the caller)
+
+%define OFS_R1  (16*(4*4))      ; byte offset just past the 16 LFSR cells (16 bytes each)
+%define OFS_R2  (OFS_R1 + (4*4))        ; F_R2, 4 dwords
+%define OFS_X0  (OFS_R2 + (4*4))        ; BRC_X0, 4 dwords
+%define OFS_X1  (OFS_X0 + (4*4))        ; BRC_X1, 4 dwords
+%define OFS_X2  (OFS_X1 + (4*4))        ; BRC_X2, 4 dwords
+%define OFS_X3  (OFS_X2 + (4*4))        ; BRC_X3, 4 dwords
+
+%ifidn __OUTPUT_FORMAT__, win64
+        %define XMM_STORAGE     16*10
+%else
+        %define XMM_STORAGE     0
+%endif
+
+%define VARIABLE_OFFSET XMM_STORAGE
+; FUNC_SAVE: push r12-r15 (plus rdi/rsi on win64), keep the pre-alignment rsp in r14, align rsp down to 64
+%macro FUNC_SAVE 0
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+%ifidn __OUTPUT_FORMAT__, win64
+        push    rdi
+        push    rsi
+%endif
+        mov     r14, rsp                ; FUNC_RESTORE reloads rsp from r14
+
+        sub     rsp, VARIABLE_OFFSET
+        and     rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 need to be maintained for Windows
+        movdqu [rsp + 0*16],xmm6
+        movdqu [rsp + 1*16],xmm7
+        movdqu [rsp + 2*16],xmm8
+        movdqu [rsp + 3*16],xmm9
+        movdqu [rsp + 4*16],xmm10
+        movdqu [rsp + 5*16],xmm11
+        movdqu [rsp + 6*16],xmm12
+        movdqu [rsp + 7*16],xmm13
+        movdqu [rsp + 8*16],xmm14
+        movdqu [rsp + 9*16],xmm15
+%endif
+%endmacro
+
+; FUNC_RESTORE: undo FUNC_SAVE; rsp and r14 must still hold the values FUNC_SAVE left in them
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+        movdqu xmm15, [rsp + 9*16]
+        movdqu xmm14, [rsp + 8*16]
+        movdqu xmm13, [rsp + 7*16]
+        movdqu xmm12, [rsp + 6*16]
+        movdqu xmm11, [rsp + 5*16]
+        movdqu xmm10, [rsp + 4*16]
+        movdqu xmm9, [rsp + 3*16]
+        movdqu xmm8, [rsp + 2*16]
+        movdqu xmm7, [rsp + 1*16]
+        movdqu xmm6, [rsp + 0*16]
+%endif
+        mov     rsp, r14                ; back to the stack pointer recorded after the pushes
+%ifidn __OUTPUT_FORMAT__, win64
+        pop     rsi
+        pop     rdi
+%endif
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+%endmacro
+
+
+;
+;   make_u31(Rt, Ke, Ek, Iv): Rt = (Ke[8:0] << 23) | (Ek[14:0] << 8) | Iv[7:0]
+;
+%macro  make_u31    4
+
+%define %%Rt        %1
+%define %%Ke        %2
+%define %%Ek        %3
+%define %%Iv        %4
+    xor         %%Rt, %%Rt
+    shrd        %%Rt, %%Iv, 8       ; Rt[31:24] = low 8 bits of Iv
+    shrd        %%Rt, %%Ek, 15      ; Rt[31:17] = low 15 bits of Ek, Iv byte moves to [16:9]
+    shrd        %%Rt, %%Ke, 9       ; Rt[31:23] = low 9 bits of Ke, Ek to [22:8], Iv to [7:0]
+%endmacro
+
+
+;
+;   bits_reorg4()
+;
+;   params
+;       %1 - round number
+;       rax - LFSR pointer
+;   uses
+;       xmm0-xmm3, xmm5, xmm7, xmm9, xmm11, xmm14, xmm15
+;   return
+;       BRC_X0..BRC_X3 stored at [rax + OFS_X0] .. [rax + OFS_X3]
+%macro  bits_reorg4 1
+    ;
+    ; xmm15 = LFSR_S15
+    ; xmm14 = LFSR_S14
+    ; xmm11 = LFSR_S11
+    ; xmm9  = LFSR_S9
+    ; xmm7  = LFSR_S7
+    ; xmm5  = LFSR_S5
+    ; xmm2  = LFSR_S2
+    ; xmm0  = LFSR_S0
+    ;
+    movdqa      xmm15, [rax + ((15 + %1) % 16)*16]
+    movdqa      xmm14, [rax + ((14 + %1) % 16)*16]
+    movdqa      xmm11, [rax + ((11 + %1) % 16)*16]
+    movdqa      xmm9,  [rax + (( 9 + %1) % 16)*16]
+    movdqa      xmm7,  [rax + (( 7 + %1) % 16)*16]
+    movdqa      xmm5,  [rax + (( 5 + %1) % 16)*16]
+    movdqa      xmm2,  [rax + (( 2 + %1) % 16)*16]
+    movdqa      xmm0,  [rax + (( 0 + %1) % 16)*16]
+
+    pxor        xmm1, xmm1
+    pslld       xmm15, 1
+    movdqa      xmm3, xmm14
+    pblendw     xmm3, xmm1, 0xAA        ; xmm3 = low 16 bits of each S14 dword
+    pblendw     xmm15, xmm3, 0x55       ; high half from (S15 << 1), low half from S14
+
+    movdqa      [rax + OFS_X0], xmm15   ; BRC_X0
+    pslld       xmm11, 16
+    psrld       xmm9, 15
+    por         xmm11, xmm9             ; (S11 << 16) | (S9 >> 15)
+    movdqa      [rax + OFS_X1], xmm11   ; BRC_X1
+    pslld       xmm7, 16
+    psrld       xmm5, 15
+    por         xmm7, xmm5              ; (S7 << 16) | (S5 >> 15)
+    movdqa      [rax + OFS_X2], xmm7    ; BRC_X2
+    pslld       xmm2, 16
+    psrld       xmm0, 15
+    por         xmm2, xmm0              ; (S2 << 16) | (S0 >> 15)
+    movdqa      [rax + OFS_X3], xmm2    ; BRC_X3
+%endmacro
+
+%macro lookup_single_sbox 2
+%define %%table   %1 ; [in] Pointer to table to look up
+%define %%idx_val %2 ; [in/out] Index to look up and returned value (rcx, rdx, r8, r9)
+    ;; Replaces idx_val with the byte at table[idx_val], zero-extended; no other register changes
+%ifdef SAFE_LOOKUP
+    ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10)
+    ;; and registers for param passing and return (4 regs, OS dependent)
+    ;; (6*16 + 6*8 = 144 bytes)
+    sub     rsp, 144
+
+    movdqu  [rsp], xmm0
+    movdqu  [rsp + 16], xmm1
+    movdqu  [rsp + 32], xmm2
+    movdqu  [rsp + 48], xmm3
+    movdqu  [rsp + 64], xmm4
+    movdqu  [rsp + 80], xmm5
+    mov     [rsp + 96], r9
+    mov     [rsp + 104], r10
+
+%ifdef LINUX
+    mov     [rsp + 112], rdi
+    mov     [rsp + 120], rsi
+    mov     [rsp + 128], rdx
+    mov     rdi, %%table
+    mov     rsi, %%idx_val
+    mov     rdx, 256
+%else
+%ifnidni %%idx_val, rcx
+    mov     [rsp + 112], rcx
+%endif
+%ifnidni %%idx_val, rdx
+    mov     [rsp + 120], rdx
+%endif
+%ifnidni %%idx_val, r8
+    mov     [rsp + 128], r8
+%endif
+
+    mov     rdx, %%idx_val
+    mov     rcx, %%table
+    mov     r8,  256
+%endif
+    mov     [rsp + 136], rax
+
+    call    lookup_8bit_sse     ; NOTE(review): no win64 shadow space below the saved regs - confirm the callee never uses it
+
+    ;; Restore all registers
+    movdqu  xmm0, [rsp]
+    movdqu  xmm1, [rsp + 16]
+    movdqu  xmm2, [rsp + 32]
+    movdqu  xmm3, [rsp + 48]
+    movdqu  xmm4, [rsp + 64]
+    movdqu  xmm5, [rsp + 80]
+    mov     r9,   [rsp + 96]
+    mov     r10,  [rsp + 104]
+
+%ifdef LINUX
+    mov     rdi, [rsp + 112]
+    mov     rsi, [rsp + 120]
+    mov     rdx, [rsp + 128]
+%else
+%ifnidni %%idx_val, rcx
+    mov     rcx, [rsp + 112]
+%endif
+%ifnidni %%idx_val, rdx
+    mov     rdx, [rsp + 120]
+%endif
+%ifnidni %%idx_val, r8
+    mov     r8,  [rsp + 128]    ; slot 128 is only written when idx_val is not r8
+%endif
+%endif
+
+    ;; Move returned value from lookup function, before restoring rax
+    mov     DWORD(%%idx_val), eax
+    mov     rax, [rsp + 136]
+
+    add     rsp, 144
+
+%else ;; SAFE_LOOKUP
+
+    movzx DWORD(%%idx_val), BYTE [%%table + %%idx_val]
+
+%endif ;; SAFE_LOOKUP
+%endmacro
+
+;
+;   sbox_lkup()
+;
+;   params
+;       %1  R1/R2 table offset
+;       %2  R1/R2 entry offset
+;       %3  xmm reg name
+;   uses
+;       rcx,rdx,r8,r9,r10 as scratch; rsi,rdi = lookup tables; rax = state base
+;   return
+;       the four looked-up bytes of dword %2 stored at [rax + %1 + (%2 * 4)]
+%macro  sbox_lkup   3
+    pextrb      rcx, %3, (0 + (%2 * 4))
+    lookup_single_sbox rsi, rcx         ; byte 0 through the table at rsi
+
+    pextrb      rdx, %3, (1 + (%2 * 4))
+    lookup_single_sbox rdi, rdx         ; byte 1 through the table at rdi
+
+    xor         r10, r10
+    pextrb      r8,  %3, (2 + (%2 * 4))
+    lookup_single_sbox rsi, r8          ; byte 2 through the table at rsi
+    pextrb      r9,  %3, (3 + (%2 * 4))
+    lookup_single_sbox rdi, r9          ; byte 3 through the table at rdi
+
+    shrd        r10d, ecx, 8            ; shift each result in from the top:
+    shrd        r10d, edx, 8            ; after all four, cl is in bits 7:0
+    shrd        r10d, r8d, 8            ; and r9b is in bits 31:24
+    shrd        r10d, r9d, 8
+    mov         [rax + %1 + (%2 * 4)], r10d
+%endmacro
+
+
+;
+;   rot_mod32(dst, src, n): dst = src rotated left by n bits in every 32-bit lane
+;
+;   uses xmm7
+;
+%macro  rot_mod32   3
+    movdqa      %1, %2
+    pslld       %1, %3              ; dst = src << n
+    movdqa      xmm7, %2
+    psrld       xmm7, (32 - %3)     ; xmm7 = src >> (32 - n)
+
+    por         %1, xmm7
+%endmacro
+
+
+;
+;   nonlin_fun4()
+;
+;   params
+;       %1 == 1, then calculate W
+;   uses
+;       xmm0-xmm7; rcx,rdx,r8,r9,r10 (via sbox_lkup); rsi,rdi = lookup tables; rax = state base
+;   return
+;       xmm0 = W value, updates F_R1[] / F_R2[]
+;
+%macro nonlin_fun4  1
+
+%if (%1 == 1)
+    movdqa      xmm0, [rax + OFS_X0]
+    pxor        xmm0, [rax + OFS_R1]
+    paddd       xmm0, [rax + OFS_R2]    ; W = (BRC_X0 ^ F_R1) + F_R2
+%endif
+    ;
+    movdqa      xmm1, [rax + OFS_R1]
+    movdqa      xmm2, [rax + OFS_R2]
+    paddd       xmm1, [rax + OFS_X1]    ; W1 = F_R1 + BRC_X1
+    pxor        xmm2, [rax + OFS_X2]    ; W2 = F_R2 ^ BRC_X2
+    ;
+
+    movdqa      xmm3, xmm1
+    movdqa      xmm4, xmm1
+    movdqa      xmm5, xmm2
+    movdqa      xmm6, xmm2
+    pslld       xmm3, 16
+    psrld       xmm4, 16
+    pslld       xmm5, 16
+    psrld       xmm6, 16
+    movdqa      xmm1, xmm3
+    movdqa      xmm2, xmm4
+    por         xmm1, xmm6      ; P = (W1 << 16) | (W2 >> 16)
+    por         xmm2, xmm5      ; Q = (W2 << 16) | (W1 >> 16)
+
+    ;
+    rot_mod32   xmm3, xmm1, 2
+    rot_mod32   xmm4, xmm1, 10
+    rot_mod32   xmm5, xmm1, 18
+    rot_mod32   xmm6, xmm1, 24
+    pxor        xmm1, xmm3
+    pxor        xmm1, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm1, xmm6      ; XMM1 = U = L1(P)
+
+    sbox_lkup   OFS_R1, 0, xmm1     ; F_R1[0]
+    sbox_lkup   OFS_R1, 1, xmm1     ; F_R1[1]
+    sbox_lkup   OFS_R1, 2, xmm1     ; F_R1[2]
+    sbox_lkup   OFS_R1, 3, xmm1     ; F_R1[3]
+    ;
+    rot_mod32   xmm3, xmm2, 8
+    rot_mod32   xmm4, xmm2, 14
+    rot_mod32   xmm5, xmm2, 22
+    rot_mod32   xmm6, xmm2, 30
+    pxor        xmm2, xmm3
+    pxor        xmm2, xmm4
+    pxor        xmm2, xmm5
+    pxor        xmm2, xmm6      ; XMM2 = V = L2(Q)
+    ;
+
+    sbox_lkup   OFS_R2, 0, xmm2     ; F_R2[0]
+    sbox_lkup   OFS_R2, 1, xmm2     ; F_R2[1]
+    sbox_lkup   OFS_R2, 2, xmm2     ; F_R2[2]
+    sbox_lkup   OFS_R2, 3, xmm2     ; F_R2[3]
+%endmacro
+
+
+;
+;   store_kstr4()
+;
+;   params
+;       top four stack slots = output pointers (popped in the order 4, 3, 2, 1)
+;   uses
+;       xmm0 as input; clobbers rcx, rdx, r8, r9, r12-r15
+;   return
+;       one dword written through each pointer; pointers advanced by 4 and pushed back
+%macro  store_kstr4 0
+    pxor        xmm0, [rax + OFS_X3]
+    pextrd      r15d, xmm0, 3
+    pop         r9              ; *pKeyStr4
+    pextrd      r14d, xmm0, 2
+    pop         r8              ; *pKeyStr3
+    pextrd      r13d, xmm0, 1
+    pop         rdx             ; *pKeyStr2
+    pextrd      r12d, xmm0, 0
+    pop         rcx             ; *pKeyStr1
+    mov         [r9], r15d
+    mov         [r8], r14d
+    mov         [rdx], r13d
+    mov         [rcx], r12d
+    add         rcx, 4
+    add         rdx, 4
+    add         r8, 4
+    add         r9, 4
+    push        rcx             ; push in reverse pop order so the stack layout is unchanged
+    push        rdx
+    push        r8
+    push        r9
+%endmacro
+
+
+;
+;   add_mod31()
+;       add two 32-bit args and reduce mod (2^31-1)
+;   params
+;       %1  - arg1/res
+;       %2  - arg2
+;   uses
+;       xmm2, MASK31
+;   return
+;       %1
+%macro  add_mod31   2
+    paddd       %1, %2
+    movdqa     xmm2, %1
+    psrld      xmm2, 31         ; xmm2 = bit 31 of the sum (the carry out of 31 bits)
+    pand        %1, MASK31
+    paddd       %1, xmm2        ; fold the carry back into the low 31 bits
+%endmacro
+
+
+;
+;   rot_mod31()
+;       rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1)
+;   params
+;       %1  - arg
+;       %2  - # of bits
+;   uses
+;       xmm2, MASK31
+;   return
+;       %1
+%macro  rot_mod31   2
+
+    movdqa     xmm2, %1
+    pslld      xmm2, %2         ; xmm2 = x << n
+    psrld      %1, (31 - %2)    ; x >> (31 - n)
+
+    por         %1, xmm2
+    pand        %1, MASK31      ; and with MASK31 after combining both halves
+%endmacro
+
+
+;
+;   lfsr_updt4()
+;
+;   params
+;       %1 - round number
+;   uses
+;       xmm0 as input (ZERO or W); clobbers xmm1-xmm4, xmm10, xmm13, xmm15; reads MASK31, rax
+;   return
+;       feedback value stored at [rax + ((0 + %1) % 16)*16] and left in xmm0
+%macro  lfsr_updt4  1
+    ;
+    ; xmm1  = LFSR_S0
+    ; xmm4  = LFSR_S4
+    ; xmm10 = LFSR_S10
+    ; xmm13 = LFSR_S13
+    ; xmm15 = LFSR_S15
+    ;
+    pxor        xmm3, xmm3              ; xmm3 = 0 (not read again inside this macro)
+    movdqa      xmm1,  [rax + (( 0 + %1) % 16)*16]
+    movdqa      xmm4,  [rax + (( 4 + %1) % 16)*16]
+    movdqa      xmm10, [rax + ((10 + %1) % 16)*16]
+    movdqa      xmm13, [rax + ((13 + %1) % 16)*16]
+    movdqa      xmm15, [rax + ((15 + %1) % 16)*16]
+
+    ; Calculate LFSR feedback
+    add_mod31   xmm0, xmm1
+    rot_mod31   xmm1, 8
+    add_mod31   xmm0, xmm1
+    rot_mod31   xmm4, 20
+    add_mod31   xmm0, xmm4
+    rot_mod31   xmm10, 21
+    add_mod31   xmm0, xmm10
+    rot_mod31   xmm13, 17
+    add_mod31   xmm0, xmm13
+    rot_mod31   xmm15, 15
+    add_mod31   xmm0, xmm15
+
+
+
+    movdqa      [rax + (( 0 + %1) % 16)*16], xmm0
+
+    ; The new cell overwrites the slot that held LFSR_S0 for this round number
+%endmacro
+
+
+;   key_expand_4(%1 = first LFSR index i (callers pass 0,2,..,14), %2 = lane 0..3)
+;   For i and i+1: make_u31 of byte [rdi+i], word [rbx+2*i], byte [rsi+i]; result stored
+;   as a dword at [rax + 16*i + 4*lane]. Writes r8d-r15d. make_u31 is defined elsewhere.
+%macro  key_expand_4  2
+    movzx       r8d, byte [rdi +  (%1 + 0)]             ; byte i via rdi (callers point rdi at a key)
+    movzx       r9d, word [rbx + ((%1 + 0)*2)]          ; word i via rbx (callers point rbx at EK_d)
+    movzx       r10d, byte [rsi + (%1 + 0)]             ; byte i via rsi (callers point rsi at an IV)
+    make_u31    r11d, r8d, r9d, r10d                    ; result is taken from r11d below
+    mov         [rax +  (((%1 + 0)*16)+(%2*4))], r11d   ; slot i, lane %2
+
+    movzx       r12d, byte [rdi +  (%1 + 1)]            ; same again for index i+1
+    movzx       r13d, word [rbx + ((%1 + 1)*2)]
+    movzx       r14d, byte [rsi +  (%1 + 1)]
+    make_u31    r15d, r12d, r13d, r14d
+    mov         [rax +  (((%1 + 1)*16)+(%2*4))], r15d   ; slot i+1, lane %2
+%endmacro
+
+MKGLOBAL(asm_ZucInitialization_4_sse,function,internal)
+asm_ZucInitialization_4_sse:
+    ; In: pKe / pIv -> arrays of 4 key / 4 IV pointers; pState -> 4-lane state (LFSR slots first)
+%ifdef LINUX
+	%define		pKe	rdi
+	%define		pIv	rsi
+	%define		pState	rdx
+%else
+	%define		pKe	rcx
+	%define		pIv	rdx
+	%define		pState	r8
+%endif
+
+    ; Save non-volatile registers (rdx is pushed as well and restored on exit)
+    push    rbx
+    push    rdi
+    push    rsi
+    push    r12
+    push    r13
+    push    r14
+    push    r15
+    push    rdx
+
+    lea     rax, [pState]      ; load pointer to LFSR
+    push    pState             ; Save LFSR Pointer to stack
+
+    ; setup the key pointer for first buffer key expand
+    mov     rbx, [pKe]      ; rbx = first entry of the key pointer array
+
+    push    pKe             ; save the address of the key pointer array on the stack
+    lea     rdi, [rbx]      ; load the pointer to the first key into rdi
+
+
+    ; setup the IV pointer for first buffer key expand
+    mov     rcx, [pIv]      ; rcx = first entry of the IV pointer array
+    push    pIv             ; save the address of the IV pointer array on the stack
+    lea     rsi, [rcx]      ; load the first IV pointer
+    ; stack, top first: IV array ptr, key array ptr, state ptr, then the saved registers
+    lea     rbx, [EK_d]     ; load D variables
+
+    ; Expand key packet 1
+    key_expand_4  0, 0
+    key_expand_4  2, 0
+    key_expand_4  4, 0
+    key_expand_4  6, 0
+    key_expand_4  8, 0
+    key_expand_4  10, 0
+    key_expand_4  12, 0
+    key_expand_4  14, 0
+
+
+    ;second packet key expand here - reset pointers
+    pop     rdx             ; get IV array pointer from Stack
+    mov     rcx, [rdx+8]      ; rcx = second entry of the IV pointer array
+    lea     rsi, [rcx]    ; load pointer to IV2
+
+    pop     rbx             ; get Key array pointer from Stack
+    mov     rcx, [rbx+8]      ; rcx = second entry of the key pointer array
+    lea     rdi, [rcx]    ; load pointer to Key 2
+
+    push    rbx             ; save Key pointer
+    push    rdx             ; save IV pointer
+
+    lea     rbx, [EK_d]     ; rbx was used for the array pointer; point it at EK_d again
+
+    ; Expand key packet 2
+    key_expand_4  0, 1
+    key_expand_4  2, 1
+    key_expand_4  4, 1
+    key_expand_4  6, 1
+    key_expand_4  8, 1
+    key_expand_4  10, 1
+    key_expand_4  12, 1
+    key_expand_4  14, 1
+
+
+
+    ;Third packet key expand here - reset pointers
+    pop     rdx             ; get IV array pointer from Stack
+    mov     rcx, [rdx+16]      ; rcx = third entry of the IV pointer array
+    lea     rsi, [rcx]    ; load pointer to IV3
+
+    pop     rbx             ; get Key array pointer from Stack
+    mov     rcx, [rbx+16]      ; rcx = third entry of the key pointer array
+    lea     rdi, [rcx]    ; load pointer to Key 3
+
+    push    rbx             ; save Key pointer
+    push    rdx             ; save IV pointer
+    lea     rbx, [EK_d]     ; point rbx at EK_d again
+    ; Expand key packet 3
+    key_expand_4  0, 2
+    key_expand_4  2, 2
+    key_expand_4  4, 2
+    key_expand_4  6, 2
+    key_expand_4  8, 2
+    key_expand_4  10, 2
+    key_expand_4  12, 2
+    key_expand_4  14, 2
+
+
+
+    ;fourth packet key expand here - reset pointers
+    pop     rdx             ; get IV array pointer from Stack (not pushed back)
+    mov     rcx, [rdx+24]      ; rcx = fourth entry of the IV pointer array
+    lea     rsi, [rcx]   ; load pointer to IV4
+
+    pop     rbx             ; get Key array pointer from Stack (not pushed back)
+    mov     rcx, [rbx+24]      ; rcx = fourth entry of the key pointer array
+    lea     rdi, [rcx]   ; load pointer to Key 4
+    lea     rbx, [EK_d]     ; point rbx at EK_d again
+    ; Expand key packet 4
+    key_expand_4  0, 3
+    key_expand_4  2, 3
+    key_expand_4  4, 3
+    key_expand_4  6, 3
+    key_expand_4  8, 3
+    key_expand_4  10, 3
+    key_expand_4  12, 3
+    key_expand_4  14, 3
+
+    ; Set R1 and R2 to zero (disabled: nothing in this routine clears them explicitly;
+    ;xor     r10, r10
+    ;xor     r11, r11
+    ; NOTE(review): presumably the caller zeroes the whole state beforehand -- confirm)
+
+    ; Only the state pointer is left on the stack above the saved registers.
+    ; Load read-only registers
+	lea     rdi, [S0]       ; used by sbox_lkup() macro
+    lea     rsi, [S1]
+    movdqa  xmm12, [mask31]
+
+    ; Shift LFSR 32-times, update state variables
+%assign N 0
+%rep 32
+    pop     rdx             ; re-read the state pointer from the top of the stack ...
+    lea     rax, [rdx]      ; ... into rax, the base register used by lfsr_updt4
+    push    rdx
+
+    bits_reorg4 N
+    nonlin_fun4 1
+    psrld  xmm0,1         ; Shift out LSB of W
+
+    pop     rdx             ; reload rax again (presumably the macros above clobber it)
+    lea     rax, [rdx]
+    push    rdx
+
+    lfsr_updt4  N           ; W (xmm0) used in LFSR update - not set to zero
+%assign N N+1
+%endrep
+
+    ; And once more, initial round from keygen phase = 33 times
+    pop     rdx
+    lea     rax, [rdx]
+    push    rdx
+
+    bits_reorg4 0
+    nonlin_fun4 0
+
+    pop     rdx             ; final reload; the state pointer is now off the stack
+    lea     rax, [rdx]
+
+    pxor    xmm0, xmm0      ; this last LFSR update is fed zero instead of W
+    lfsr_updt4  0
+
+
+
+    ; Restore non-volatile registers
+    pop        rdx
+    pop         r15
+    pop         r14
+    pop         r13
+    pop         r12
+    pop         rsi
+    pop         rdi
+    pop         rbx
+
+    ret
+;
+;
+;
+;; Runs 16 keystream rounds; every round stores one dword per lane via store_kstr4.
+;; void asm_ZucGenKeystream64B_4_sse(state4_t *pSta, u32* pKeyStr1, u32* pKeyStr2, u32* pKeyStr3, u32* pKeyStr4);
+;; The LFSR slots at pSta are updated in place by every round (lfsr_updt4).
+;; WIN64
+;;  RCX    - pSta
+;;  RDX    - pKeyStr1
+;;  R8     - pKeyStr2
+;;  R9     - pKeyStr3
+;;  Stack  - pKeyStr4
+;;
+;; LIN64
+;;  RDI - pSta
+;;  RSI - pKeyStr1
+;;  RDX - pKeyStr2
+;;  RCX - pKeyStr3
+;;  R8  - pKeyStr4
+;;
+MKGLOBAL(asm_ZucGenKeystream64B_4_sse,function,internal)
+asm_ZucGenKeystream64B_4_sse:
+
+%ifdef LINUX
+	%define		pState	rdi
+	%define		pKS1	rsi
+	%define		pKS2	rdx
+	%define		pKS3	rcx
+	%define		pKS4	r8
+%else
+	%define		pState	rcx
+	%define		pKS1	rdx
+	%define		pKS2	r8
+	%define		pKS3	r9
+        %define         pKS4    rax
+%endif
+
+%ifndef LINUX
+    mov         rax, [rsp + 8*5] ; 5th parameter from stack (read before any push moves rsp)
+%endif
+
+    ; Save non-volatile registers
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+%ifndef LINUX
+    push        rdi
+    push        rsi
+%endif
+    ; Store 4 keystream pointers on the stack
+    ; (pKS4 ends up on top; store_kstr4 pops all four, advances them and pushes them back)
+    push        pKS1
+    push        pKS2
+    push        pKS3
+    push        pKS4
+
+
+    ; Load state pointer in RAX
+    mov         rax, pState
+
+
+    ; Load read-only registers
+    lea         rdi, [S0]       ; used by sbox_lkup() macro
+    lea         rsi, [S1]
+    movdqa      xmm12, [mask31]
+
+    ; Generate 64B of keystream in 16 rounds
+%assign N 1
+%rep 16
+    bits_reorg4 N
+    nonlin_fun4 1
+    store_kstr4
+    pxor        xmm0, xmm0      ; the LFSR update in keystream mode is fed zero
+    lfsr_updt4  N
+%assign N N+1
+%endrep
+
+    ; Take keystream pointers off (#push = #pops)
+    pop         rax
+    pop         rax
+    pop         rax
+    pop         rax
+
+%ifndef LINUX
+    pop        rsi
+    pop        rdi
+%endif
+
+    ; Restore non-volatile registers
+    pop         r15
+    pop         r14
+    pop         r13
+    pop         r12
+    pop         rbx
+    ret
+
+;;
+;; extern uint32_t asm_Eia3RemainderSSE(const void *ks, const void *data, uint64_t n_bits)
+;;
+;; Returns authentication update value to be XOR'ed with current authentication tag.
+;; Stages: up to 3 x 128 bits, then up to 3 x 32 bits, then the rest bit by bit.
+;; WIN64
+;;	RCX - KS (key stream pointer)
+;;	RDX - DATA (data pointer)
+;;	R8  - N_BITS (number data bits to process)
+;; LIN64
+;;	RDI - KS (key stream pointer)
+;;	RSI - DATA (data pointer)
+;;	RDX - N_BITS (number data bits to process)
+;; NOTE(review): the unrolling suggests n_bits < 512 is expected -- confirm with callers.
+align 16
+MKGLOBAL(asm_Eia3RemainderSSE,function,internal)
+asm_Eia3RemainderSSE:
+%ifdef LINUX
+	%define		KS	rdi
+	%define		DATA	rsi
+	%define		N_BITS	rdx
+%else
+	%define		KS	rcx
+	%define		DATA	rdx
+	%define		N_BITS	r8
+%endif
+
+        FUNC_SAVE               ; macro defined elsewhere in this file
+
+        movdqa  xmm5, [bit_reverse_table_l]
+        movdqa  xmm6, [bit_reverse_table_h]
+        movdqa  xmm7, [bit_reverse_and_table]
+        movdqa  xmm10, [data_mask_64bits]
+
+        pxor    xmm9, xmm9      ; xmm9 accumulates the XOR of all clmul products
+
+%rep 3
+        cmp     N_BITS, 128
+        jb      Eia3RoundsSSE_dq_end    ; fewer than 128 bits left: leave the 16-byte stage
+
+        ;; read 16 bytes and reverse bits
+        movdqu  xmm0, [DATA]
+        movdqa  xmm1, xmm0
+        pand    xmm1, xmm7      ; xmm1 = data AND xmm7
+
+        movdqa  xmm2, xmm7
+        pandn   xmm2, xmm0      ; xmm2 = data AND NOT xmm7 ...
+        psrld   xmm2, 4         ; ... shifted right by 4
+
+        movdqa  xmm8, xmm6      ; bit reverse low nibbles (use high table)
+        pshufb  xmm8, xmm1
+
+        movdqa  xmm4, xmm5      ; bit reverse high nibbles (use low table)
+        pshufb  xmm4, xmm2
+
+        por     xmm8, xmm4
+        ; xmm8 - bit reversed data bytes
+
+        ;; ZUC authentication part
+        ;; - 4x32 data bits
+        ;; - set up KS
+        movdqu  xmm3, [KS + (0*4)]      ; 16 bytes of key stream starting at dword 0
+        movdqu  xmm4, [KS + (2*4)]      ; 16 bytes of key stream starting at dword 2
+        pshufd  xmm0, xmm3, 0x61
+        pshufd  xmm1, xmm4, 0x61
+
+        ;;  - set up DATA
+        movdqa  xmm2, xmm8
+        pand    xmm2, xmm10
+        pshufd  xmm3, xmm2, 0xdc
+        movdqa  xmm4, xmm3
+
+        psrldq  xmm8, 8         ; bring the upper 8 data bytes down
+        pshufd  xmm13, xmm8, 0xdc
+        movdqa  xmm14, xmm13
+
+        ;; - clmul
+        ;; - xor the results from 4 32-bit words together
+        pclmulqdq xmm3, xmm0, 0x00
+        pclmulqdq xmm4, xmm0, 0x11
+        pclmulqdq xmm13, xmm1, 0x00
+        pclmulqdq xmm14, xmm1, 0x11
+
+        pxor    xmm3, xmm4
+        pxor    xmm13, xmm14
+        pxor    xmm9, xmm3
+        pxor    xmm9, xmm13
+        lea     DATA, [DATA + 16]       ; advance past the 16 bytes just consumed
+        lea     KS, [KS + 16]
+        sub     N_BITS, 128
+%endrep
+Eia3RoundsSSE_dq_end:
+
+%rep 3
+        cmp     N_BITS, 32
+        jb      Eia3RoundsSSE_dw_end    ; fewer than 32 bits left: leave the dword stage
+
+        ;; swap dwords in KS
+        movq    xmm1, [KS]      ; 8 bytes of key stream
+        pshufd  xmm4, xmm1, 0xf1
+
+        ;;  bit-reverse 4 bytes of data
+        movdqa  xmm2, xmm7
+        movd    xmm0, [DATA]
+        movdqa  xmm1, xmm0
+        pand    xmm1, xmm2
+
+        pandn   xmm2, xmm0
+        psrld   xmm2, 4
+
+        movdqa  xmm0, xmm6    ; bit reverse low nibbles (use high table)
+        pshufb  xmm0, xmm1
+
+        movdqa  xmm3, xmm5    ; bit reverse high nibbles (use low table)
+        pshufb  xmm3, xmm2
+
+        por     xmm0, xmm3
+
+        ;; rol & xor
+        pclmulqdq xmm0, xmm4, 0
+        pxor    xmm9, xmm0
+
+        lea     DATA, [DATA + 4]        ; advance past the 4 bytes just consumed
+        lea     KS, [KS + 4]
+        sub     N_BITS, 32
+%endrep
+
+Eia3RoundsSSE_dw_end:
+        movq    rax, xmm9       ; low qword of the accumulator ...
+        shr     rax, 32         ; ... of which bits 63:32 form the running result
+
+        or      N_BITS, N_BITS
+        jz      Eia3RoundsSSE_byte_loop_end
+
+        ;; get 64-bit key stream for the last data bits (less than 32)
+        mov     KS, [KS]        ; from here on KS holds key stream bits, not a pointer
+
+        ;; process remaining data bytes and bits
+Eia3RoundsSSE_byte_loop:
+        or      N_BITS, N_BITS
+        jz      Eia3RoundsSSE_byte_loop_end
+
+        cmp     N_BITS, 8
+        jb      Eia3RoundsSSE_byte_partial
+
+        movzx   r11, byte [DATA]        ; whole byte available
+        sub     N_BITS, 8
+        jmp     Eia3RoundsSSE_byte_read
+
+Eia3RoundsSSE_byte_partial:
+        ;; process remaining bits (up to 7)
+        lea     r11, [bit_mask_table]
+        movzx   r10, byte [r11 + N_BITS]        ; mask selected by the number of bits left
+        movzx   r11, byte [DATA]
+        and     r11, r10
+        xor     N_BITS, N_BITS          ; nothing left after this byte
+Eia3RoundsSSE_byte_read:
+        ;; test the data byte from bit 7 (0x80) down to bit 0
+%assign DATATEST 0x80
+%rep 8
+        xor     r10, r10
+        test    r11, DATATEST
+        cmovne  r10, KS         ; r10 = KS if this data bit is set, else 0
+        xor     rax, r10
+        rol     KS, 1           ; move the key stream on by one bit
+%assign DATATEST (DATATEST >> 1)
+%endrep                 ; byte boundary
+        lea     DATA, [DATA + 1]
+        jmp     Eia3RoundsSSE_byte_loop
+
+Eia3RoundsSSE_byte_loop_end:
+
+        ;; eax - holds the return value at this stage
+
+        FUNC_RESTORE
+
+        ret
+
+;;
+;; extern uint32_t asm_Eia3Round64BSSE(uint32_t T, const void *KS, const void *DATA)
+;;
+;; Updates authentication tag T based on keystream KS and DATA.
+;; - it processes 64 bytes of DATA
+;; - reads data in 16 byte chunks and bit reverses them
+;; - reads and re-arranges KS (bytes 0..79 of it are read)
+;; - employs clmul for the XOR & ROL part
+;; - NOTE(review): copying the top 64 bytes of KS to the bottom is not done here -- presumably the caller's job
+;; Returns the updated tag in eax.
+;; WIN64
+;;	RCX - T
+;;	RDX - KS pointer to key stream (2 x 64 bytes)
+;;	R8  - DATA pointer to data
+;; LIN64
+;;	RDI - T
+;;	RSI - KS pointer to key stream (2 x 64 bytes)
+;;      RDX - DATA pointer to data
+;;
+align 16
+MKGLOBAL(asm_Eia3Round64BSSE,function,internal)
+asm_Eia3Round64BSSE:
+
+%ifdef LINUX
+	%define		T	edi
+	%define		KS	rsi
+	%define		DATA	rdx
+%else
+	%define		T	ecx
+	%define		KS	rdx
+	%define		DATA	r8
+%endif
+
+        FUNC_SAVE               ; macro defined elsewhere in this file
+
+        movdqa  xmm5, [bit_reverse_table_l]
+        movdqa  xmm6, [bit_reverse_table_h]
+        movdqa  xmm7, [bit_reverse_and_table]
+        movdqa  xmm10, [data_mask_64bits]
+
+        pxor    xmm9, xmm9      ; xmm9 accumulates the XOR of all clmul products
+
+%assign I 0
+%rep 4
+        ;; read 16 bytes and reverse bits
+        movdqu  xmm0, [DATA + 16*I]
+        movdqa  xmm1, xmm0
+        pand    xmm1, xmm7
+
+        movdqa  xmm2, xmm7
+        pandn   xmm2, xmm0
+        psrld   xmm2, 4
+
+        movdqa  xmm8, xmm6      ; bit reverse low nibbles (use high table)
+        pshufb  xmm8, xmm1
+
+        movdqa  xmm4, xmm5      ; bit reverse high nibbles (use low table)
+        pshufb  xmm4, xmm2
+
+        por     xmm8, xmm4
+        ; xmm8 - bit reversed data bytes
+
+        ;; ZUC authentication part
+        ;; - 4x32 data bits
+        ;; - set up KS
+%if I != 0
+        movdqa  xmm0, xmm12             ; reuse the KS block kept from the previous iteration
+        movdqu  xmm2, [KS + (I*16) + (4*4)]
+        movdqa  xmm12, xmm2             ; keep the new block for the next iteration
+        palignr xmm2, xmm0, 8
+        pshufd  xmm1, xmm0, 0x61
+        pshufd  xmm11, xmm2, 0x61
+%else
+        movdqu  xmm2, [KS + (I*16) + (0*4)]
+        movdqu  xmm3, [KS + (I*16) + (4*4)]
+        movdqa  xmm12, xmm3             ; keep the second block for the next iteration
+        palignr xmm3, xmm2, 8
+        pshufd  xmm1, xmm2, 0x61
+        pshufd  xmm11, xmm3, 0x61
+%endif
+
+        ;;  - set up DATA
+        movdqa  xmm0, xmm8
+        pand    xmm0, xmm10
+        pshufd  xmm3, xmm0, 0xdc
+        movdqa  xmm0, xmm3
+
+        psrldq  xmm8, 8         ; bring the upper 8 data bytes down
+        pshufd  xmm13, xmm8, 0xdc
+        movdqa  xmm14, xmm13
+
+        ;; - clmul
+        ;; - xor the results from 4 32-bit words together
+        pclmulqdq xmm0, xmm1, 0x00
+        pclmulqdq xmm3, xmm1, 0x11
+        pclmulqdq xmm14, xmm11, 0x00
+        pclmulqdq xmm13, xmm11, 0x11
+
+        pxor    xmm3, xmm0
+        pxor    xmm13, xmm14
+        pxor    xmm9, xmm3
+        pxor    xmm9, xmm13
+
+%assign I (I + 1)
+%endrep
+
+        ;; - update T
+        movq    rax, xmm9       ; low qword of the accumulator ...
+        shr     rax, 32         ; ... of which bits 63:32 are the update value
+        xor     eax, T
+
+        FUNC_RESTORE
+
+        ret
+
+
+;----------------------------------------------------------------------------------------
+;----------------------------------------------------------------------------------------
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c b/src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c
new file mode 100755
index 000000000..5a4eb98c5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c
@@ -0,0 +1,554 @@
+/*******************************************************************************
+  Copyright (c) 2009-2019, Intel Corporation
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+      * Neither the name of Intel Corporation nor the names of its contributors
+        may be used to endorse or promote products derived from this software
+        without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*-----------------------------------------------------------------------
+* zuc_sse_top.c
+*-----------------------------------------------------------------------
+* An implementation of ZUC, the core algorithm for the
+* 3GPP Confidentiality and Integrity algorithms.
+*
+*-----------------------------------------------------------------------*/
+
+#include <string.h>
+
+#include "include/zuc_internal.h"
+#include "include/wireless_common.h"
+#include "include/save_xmms.h"
+#include "include/clear_regs_mem.h"
+#include "intel-ipsec-mb.h"
+
+#define SAVE_XMMS               save_xmms
+#define RESTORE_XMMS            restore_xmms
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse
+
+/*
+ * ZUC EEA3 on a single buffer: XOR `length` bytes of pBufferIn with the
+ * keystream derived from pKey/pIv and store the result in pBufferOut.
+ * Full 64-byte blocks are processed directly; a trailing partial block goes
+ * through zero-padded stack buffers so no access runs past `length` bytes.
+ * Parameters are not validated here (the public wrappers do that under
+ * SAFE_PARAM).
+ */
+static inline
+void _zuc_eea3_1_buffer_sse(const void *pKey,
+                           const void *pIv,
+                           const void *pBufferIn,
+                           void *pBufferOut,
+                           const uint32_t length)
+{
+        DECLARE_ALIGNED(ZucState_t zucState, 64);
+        /* buffer to store 64 bytes of keystream */
+        DECLARE_ALIGNED(uint8_t keyStream[64], 64);
+        /* bounce buffers for the final partial (< 64 byte) block */
+        DECLARE_ALIGNED(uint8_t tempSrc[64], 64);
+        DECLARE_ALIGNED(uint8_t tempDst[64], 64);
+
+        const uint64_t *pIn64 = (const uint64_t *) pBufferIn;
+        uint64_t *pOut64 = (uint64_t *) pBufferOut;
+        uint32_t numKeyStreamsPerPkt = length / ZUC_KEYSTR_LEN;
+        const uint32_t numBytesLeftOver = length % ZUC_KEYSTR_LEN;
+
+        /* need to set the LFSR state to zero */
+        memset(&zucState, 0, sizeof(ZucState_t));
+
+        /* initialize the zuc state */
+        asm_ZucInitialization(pKey, pIv, &(zucState));
+
+        /* XOR every full 64-byte block with freshly generated keystream */
+        while (numKeyStreamsPerPkt--) {
+                asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+                asm_XorKeyStream64B_sse(pIn64, pOut64,
+                                        (uint64_t *) keyStream);
+                pIn64 += 8;
+                pOut64 += 8;
+        }
+
+        /* Remaining 0 to 63 bytes */
+        if (numBytesLeftOver) {
+                const uint8_t *pIn8 = (const uint8_t *) pBufferIn;
+                uint8_t *pOut8 = (uint8_t *) pBufferOut;
+                const uint32_t offset = length - numBytesLeftOver;
+
+                asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+
+                /* Copy the tail into a zero-padded temporary block so the
+                 * 64-byte XOR never reads uninitialized stack (same as the
+                 * 4-buffer path), then copy only the valid bytes back. */
+                memcpy(&tempSrc[0], &pIn8[offset], numBytesLeftOver);
+                memset(&tempSrc[numBytesLeftOver], 0,
+                       sizeof(tempSrc) - numBytesLeftOver);
+
+                asm_XorKeyStream64B_sse((uint64_t *) &tempSrc[0],
+                                        (uint64_t *) &tempDst[0],
+                                        (uint64_t *) &keyStream[0]);
+                memcpy(&pOut8[offset], &tempDst[0], numBytesLeftOver);
+        }
+#ifdef SAFE_DATA
+        /* Clear sensitive data in stack. tempDst holds raw keystream past
+         * the valid bytes and tempSrc holds a copy of the input tail. */
+        clear_mem(keyStream, sizeof(keyStream));
+        clear_mem(&zucState, sizeof(zucState));
+        clear_mem(tempSrc, sizeof(tempSrc));
+        clear_mem(tempDst, sizeof(tempDst));
+#endif
+}
+
+/*
+ * ZUC EEA3 on four buffers at once. All four lanes go through the 4-lane SSE
+ * keystream generator for as many full 64-byte blocks as the shortest buffer
+ * has; each lane's state is then copied into a single-lane state and the rest
+ * of that buffer is finished with the single-buffer primitives.
+ * Parameters are not validated here (the public wrappers do that under
+ * SAFE_PARAM). Under SAFE_DATA every stack buffer that held cipher state,
+ * keystream or payload bytes is wiped before returning.
+ */
+static inline
+void _zuc_eea3_4_buffer_sse(const void * const pKey[4],
+                            const void * const pIv[4],
+                            const void * const pBufferIn[4],
+                            void *pBufferOut[4],
+                            const uint32_t length[4])
+{
+
+        DECLARE_ALIGNED(ZucState4_t state, 64);
+        DECLARE_ALIGNED(ZucState_t singlePktState, 64);
+
+        unsigned int i = 0;
+        unsigned int j = 0;
+        /* Calculate the minimum input packet size */
+        uint32_t bytes1 = (length[0] < length[1] ?
+                           length[0] : length[1]);
+        uint32_t bytes2 = (length[2] < length[3] ?
+                           length[2] : length[3]);
+        /* min number of bytes */
+        uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2;
+        uint32_t numKeyStreamsPerPkt = bytes/ZUC_KEYSTR_LEN;
+        uint32_t remainBytes[4] = {0};
+        DECLARE_ALIGNED(uint8_t keyStr1[64], 64);
+        DECLARE_ALIGNED(uint8_t keyStr2[64], 64);
+        DECLARE_ALIGNED(uint8_t keyStr3[64], 64);
+        DECLARE_ALIGNED(uint8_t keyStr4[64], 64);
+        DECLARE_ALIGNED(uint8_t tempSrc[64], 64);
+        DECLARE_ALIGNED(uint8_t tempDst[64], 64);
+        /* structure to store the 4 keys */
+        DECLARE_ALIGNED(ZucKey4_t keys, 64);
+        /* structure to store the 4 IV's */
+        DECLARE_ALIGNED(ZucIv4_t ivs, 64);
+        uint32_t numBytesLeftOver = 0;
+        const uint8_t *pTempBufInPtr = NULL;
+        uint8_t *pTempBufOutPtr = NULL;
+
+        const uint64_t *pIn64_0 = NULL;
+        const uint64_t *pIn64_1 = NULL;
+        const uint64_t *pIn64_2 = NULL;
+        const uint64_t *pIn64_3 = NULL;
+        uint64_t *pOut64_0 = NULL;
+        uint64_t *pOut64_1 = NULL;
+        uint64_t *pOut64_2 = NULL;
+        uint64_t *pOut64_3 = NULL;
+        uint64_t *pTempSrc64 = NULL;
+        uint64_t *pTempDst64 = NULL;
+        uint64_t *pKeyStream64 = NULL;
+
+        /* rounded down minimum length */
+        bytes = numKeyStreamsPerPkt * ZUC_KEYSTR_LEN;
+
+        /* Need to set the LFSR state to zero */
+        memset(&state, 0, sizeof(ZucState4_t));
+
+        /* Calculate the number of bytes left over for each packet */
+        for (i=0; i< 4; i++)
+                remainBytes[i] = length[i] - bytes;
+
+        /* Setup the Keys */
+        keys.pKey1 = pKey[0];
+        keys.pKey2 = pKey[1];
+        keys.pKey3 = pKey[2];
+        keys.pKey4 = pKey[3];
+
+        /* setup the IV's */
+        ivs.pIv1 = pIv[0];
+        ivs.pIv2 = pIv[1];
+        ivs.pIv3 = pIv[2];
+        ivs.pIv4 = pIv[3];
+
+        asm_ZucInitialization_4_sse( &keys,  &ivs, &state);
+
+        pOut64_0 = (uint64_t *) pBufferOut[0];
+        pOut64_1 = (uint64_t *) pBufferOut[1];
+        pOut64_2 = (uint64_t *) pBufferOut[2];
+        pOut64_3 = (uint64_t *) pBufferOut[3];
+
+        pIn64_0 = (const uint64_t *) pBufferIn[0];
+        pIn64_1 = (const uint64_t *) pBufferIn[1];
+        pIn64_2 = (const uint64_t *) pBufferIn[2];
+        pIn64_3 = (const uint64_t *) pBufferIn[3];
+
+        /* Loop for 64 bytes at a time generating 4 key-streams per loop */
+        while (numKeyStreamsPerPkt) {
+                /* Generate 64 bytes at a time */
+                asm_ZucGenKeystream64B_4_sse(&state,
+                                             (uint32_t *) keyStr1,
+                                             (uint32_t *) keyStr2,
+                                             (uint32_t *) keyStr3,
+                                             (uint32_t *) keyStr4);
+
+                /* XOR the KeyStream with the input buffers and store in output
+                 * buffer*/
+                pKeyStream64 = (uint64_t *) keyStr1;
+                asm_XorKeyStream64B_sse(pIn64_0, pOut64_0, pKeyStream64);
+                pIn64_0 += 8;
+                pOut64_0 += 8;
+
+                pKeyStream64 = (uint64_t *) keyStr2;
+                asm_XorKeyStream64B_sse(pIn64_1, pOut64_1, pKeyStream64);
+                pIn64_1 += 8;
+                pOut64_1 += 8;
+
+                pKeyStream64 = (uint64_t *) keyStr3;
+                asm_XorKeyStream64B_sse(pIn64_2, pOut64_2, pKeyStream64);
+                pIn64_2 += 8;
+                pOut64_2 += 8;
+
+                pKeyStream64 = (uint64_t *) keyStr4;
+                asm_XorKeyStream64B_sse(pIn64_3, pOut64_3, pKeyStream64);
+                pIn64_3 += 8;
+                pOut64_3 += 8;
+
+                /* Update keystream count */
+                numKeyStreamsPerPkt--;
+
+        }
+
+        /* process each packet separately for the remaining bytes */
+        for (i = 0; i < 4; i++) {
+                if (remainBytes[i]) {
+                        /* copy lane i of all 16 LFSR words to the single packet state */
+                        for (j = 0; j < 16; j++)
+                                singlePktState.lfsrState[j] =
+                                        state.lfsrState[j][i];
+
+                        singlePktState.fR1 = state.fR1[i];
+                        singlePktState.fR2 = state.fR2[i];
+
+                        singlePktState.bX0 = state.bX0[i];
+                        singlePktState.bX1 = state.bX1[i];
+                        singlePktState.bX2 = state.bX2[i];
+                        singlePktState.bX3 = state.bX3[i];
+
+                        numKeyStreamsPerPkt = remainBytes[i] / ZUC_KEYSTR_LEN;
+                        numBytesLeftOver = remainBytes[i]  % ZUC_KEYSTR_LEN;
+
+                        pTempBufInPtr = pBufferIn[i];
+                        pTempBufOutPtr = pBufferOut[i];
+
+                        /* update the output and input pointers here to point
+                         * to the i'th buffers */
+                        pOut64_0 = (uint64_t *) &pTempBufOutPtr[length[i] -
+                                                                remainBytes[i]];
+                        pIn64_0 = (const uint64_t *) &pTempBufInPtr[length[i] -
+                                                                remainBytes[i]];
+
+                        while (numKeyStreamsPerPkt--) {
+                                /* Generate the key stream 64 bytes at a time */
+                                asm_ZucGenKeystream64B((uint32_t *) keyStr1,
+                                                       &singlePktState);
+                                pKeyStream64 = (uint64_t *) keyStr1;
+                                asm_XorKeyStream64B_sse(pIn64_0, pOut64_0,
+                                                        pKeyStream64);
+                                pIn64_0 += 8;
+                                pOut64_0 += 8;
+                        }
+
+
+                        /* Check for remaining 0 to 63 bytes */
+                        if (numBytesLeftOver) {
+                                asm_ZucGenKeystream64B((uint32_t *) &keyStr1,
+                                                       &singlePktState);
+                                uint32_t offset = length[i] - numBytesLeftOver;
+
+                                /* copy the remaining bytes into temporary
+                                 * buffer and XOR with the 64-bytes of
+                                 * keystream. Then copy on the valid bytes back
+                                 * to the output buffer */
+                                memcpy(&tempSrc[0], &pTempBufInPtr[offset],
+                                       numBytesLeftOver);
+                                memset(&tempSrc[numBytesLeftOver], 0,
+                                       64 - numBytesLeftOver);
+
+                                pKeyStream64 = (uint64_t *) &keyStr1[0];
+                                pTempSrc64 = (uint64_t *) &tempSrc[0];
+                                pTempDst64 = (uint64_t *) &tempDst[0];
+                                asm_XorKeyStream64B_sse(pTempSrc64, pTempDst64,
+                                                        pKeyStream64);
+
+                                memcpy(&pTempBufOutPtr[offset],
+                                       &tempDst[0], numBytesLeftOver);
+                        }
+                }
+        }
+#ifdef SAFE_DATA
+        /* Clear sensitive data in stack; tempDst holds raw keystream past
+         * the valid bytes because tempSrc is zero-padded */
+        clear_mem(keyStr1, sizeof(keyStr1));
+        clear_mem(keyStr2, sizeof(keyStr2));
+        clear_mem(keyStr3, sizeof(keyStr3));
+        clear_mem(keyStr4, sizeof(keyStr4));
+        clear_mem(&singlePktState, sizeof(singlePktState));
+        clear_mem(&state, sizeof(state));
+        clear_mem(&keys, sizeof(keys));
+        clear_mem(&ivs, sizeof(ivs));
+        clear_mem(tempSrc, sizeof(tempSrc));
+        clear_mem(tempDst, sizeof(tempDst));
+#endif
+}
+
+/* Public single-buffer ZUC-EEA3 entry point; see _zuc_eea3_1_buffer_sse(). */
+void zuc_eea3_1_buffer_sse(const void *pKey,
+                           const void *pIv,
+                           const void *pBufferIn,
+                           void *pBufferOut,
+                           const uint32_t length)
+{
+#ifndef LINUX
+        /* Non-Linux builds: snapshot XMM registers, restored before return */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+        SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+        /* Silently reject NULL pointers ... */
+        if (!pKey || !pIv || !pBufferIn || !pBufferOut)
+                return;
+
+        /* ... and lengths outside [ZUC_MIN_LEN, ZUC_MAX_LEN] */
+        if (!(length >= ZUC_MIN_LEN && length <= ZUC_MAX_LEN))
+                return;
+#endif
+
+        _zuc_eea3_1_buffer_sse(pKey, pIv, pBufferIn, pBufferOut, length);
+
+#ifdef SAFE_DATA
+        /* Scrub the scratch general-purpose and SIMD registers */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}
+
+/* Public 4-buffer ZUC-EEA3 entry point; see _zuc_eea3_4_buffer_sse(). */
+void zuc_eea3_4_buffer_sse(const void * const pKey[4],
+                           const void * const pIv[4],
+                           const void * const pBufferIn[4],
+                           void *pBufferOut[4],
+                           const uint32_t length[4])
+{
+#ifndef LINUX
+        /* Non-Linux builds: snapshot XMM registers, restored before return */
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+        SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+        unsigned int idx = 0;
+
+        /* The five argument arrays themselves must be present */
+        if (!pKey || !pIv || !pBufferIn || !pBufferOut || !length)
+                return;
+
+        /* Every lane needs non-NULL pointers and a supported length */
+        while (idx < 4) {
+                if (!pKey[idx] || !pIv[idx] ||
+                    !pBufferIn[idx] || !pBufferOut[idx])
+                        return;
+                if (length[idx] < ZUC_MIN_LEN || length[idx] > ZUC_MAX_LEN)
+                        return;
+                idx++;
+        }
+#endif
+
+        _zuc_eea3_4_buffer_sse(pKey, pIv, pBufferIn, pBufferOut, length);
+
+#ifdef SAFE_DATA
+        /* Scrub the scratch general-purpose and SIMD registers */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}
+
+
+/*
+ * N-buffer ZUC EEA3 entry point (SSE build).
+ *
+ * Walks the numBuffers entries of the argument arrays in order: as long as
+ * at least four entries remain, the next four are passed together to
+ * _zuc_eea3_4_buffer_sse(); the remaining (at most three) entries are then
+ * passed one by one to _zuc_eea3_1_buffer_sse(). Around the processing:
+ *   - !LINUX:     SAVE_XMMS() before / RESTORE_XMMS() after
+ *   - SAFE_PARAM: return without doing any work when an array pointer is
+ *                 NULL, any per-buffer pointer is NULL, or any length is
+ *                 outside [ZUC_MIN_LEN, ZUC_MAX_LEN]
+ *   - SAFE_DATA:  clear scratch general-purpose and SIMD registers at the
+ *                 end
+ */
+void zuc_eea3_n_buffer_sse(const void * const pKey[], const void * const pIv[],
+                           const void * const pBufferIn[], void *pBufferOut[],
+                           const uint32_t length[],
+                           const uint32_t numBuffers)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif
+
+        unsigned int idx;
+        unsigned int pending = numBuffers;
+
+#ifdef SAFE_PARAM
+        /* The argument arrays themselves have to be present */
+        if (!pKey || !pIv || !pBufferIn || !pBufferOut || !length)
+                return;
+
+        /* Validate every buffer up front, before any of them is processed */
+        for (idx = 0; idx < numBuffers; idx++) {
+                if (!pKey[idx] || !pIv[idx] ||
+                    !pBufferIn[idx] || !pBufferOut[idx])
+                        return;
+
+                /* Buffer length has to lie inside the supported range */
+                if (!(length[idx] >= ZUC_MIN_LEN &&
+                      length[idx] <= ZUC_MAX_LEN))
+                        return;
+        }
+#endif
+
+        /* Groups of four buffers go through the 4-buffer routine */
+        for (idx = 0; pending >= 4; idx += 4, pending -= 4)
+                _zuc_eea3_4_buffer_sse(&pKey[idx],
+                                       &pIv[idx],
+                                       &pBufferIn[idx],
+                                       &pBufferOut[idx],
+                                       &length[idx]);
+
+        /* Whatever is left (0 to 3 buffers) is handled one at a time */
+        for (; pending != 0; idx++, pending--)
+                _zuc_eea3_1_buffer_sse(pKey[idx],
+                                       pIv[idx],
+                                       pBufferIn[idx],
+                                       pBufferOut[idx],
+                                       length[idx]);
+
+#ifdef SAFE_DATA
+        /* Wipe sensitive values left behind in scratch registers */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}
+
+/*
+ * Rotate the 64-bit value u left by r bit positions and return the result.
+ *
+ * The rotation count is reduced modulo 64 and the complementary right-shift
+ * count is masked as well. This keeps r == 0 well defined (u is returned
+ * unchanged): the unmasked form would evaluate u >> 64, and shifting a
+ * 64-bit value by 64 is undefined behaviour in C. For r in 1..63 the result
+ * is the same as the plain (u << r) | (u >> (64 - r)).
+ */
+static inline uint64_t rotate_left(uint64_t u, size_t r)
+{
+        const unsigned int n = (unsigned int) (r & 63);
+
+        return (u << n) | (u >> ((64 - n) & 63));
+}
+
+/*
+ * Read sizeof(uint64_t) bytes starting at ptr and return them as a uint64_t
+ * in host byte order.
+ *
+ * The bytes are copied with memcpy() instead of dereferencing a cast
+ * pointer, so ptr does not have to be 8-byte aligned and may point into an
+ * object of a different type (this file passes the address of a uint32_t
+ * array element) without violating C's effective-type rules.
+ */
+static inline uint64_t load_uint64(const void *ptr)
+{
+        uint64_t value;
+
+        memcpy(&value, ptr, sizeof(value));
+        return value;
+}
+
+/*
+ * Single-buffer ZUC EIA3 entry point (SSE build): derives a 32-bit value
+ * from the key, the IV and lengthInBits bits of pBufferIn and stores it in
+ * *pMacI.
+ *
+ * pKey, pIv     - passed unchanged to asm_ZucInitialization()
+ * pBufferIn     - input message, consumed ZUC_KEYSTR_LEN bytes per loop
+ *                 iteration, with the tail handed to asm_Eia3RemainderSSE()
+ * lengthInBits  - message length in bits
+ * pMacI         - receives bswap4(T ^ keyStream[L - 1])
+ *
+ * Build options:
+ *   - !LINUX:     SAVE_XMMS() on entry / RESTORE_XMMS() on the normal exit
+ *   - SAFE_PARAM: returns early, leaving *pMacI untouched, when a pointer
+ *                 is NULL or lengthInBits is outside
+ *                 [ZUC_MIN_LEN, ZUC_MAX_LEN]; these early returns do not
+ *                 reach RESTORE_XMMS()
+ *   - SAFE_DATA:  wipes keyStream, zucState and scratch registers on the
+ *                 normal exit
+ */
+void zuc_eia3_1_buffer_sse(const void *pKey,
+                           const void *pIv,
+                           const void *pBufferIn,
+                           const uint32_t lengthInBits,
+                           uint32_t *pMacI)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif
+        DECLARE_ALIGNED(ZucState_t zucState, 64);
+        /*
+         * Two 16-word halves: words 0..15 hold the keystream block currently
+         * being consumed, words 16..31 receive the next block before it is
+         * copied down into the lower half.
+         */
+        DECLARE_ALIGNED(uint32_t keyStream[16 * 2], 64);
+        /* ZUC_KEYSTR_LEN is a byte count; this is the same size in bits */
+        const uint32_t keyStreamLengthInBits = ZUC_KEYSTR_LEN * 8;
+        /* generate a key-stream 2 words longer than the input message */
+        const uint32_t N = lengthInBits + (2 * ZUC_WORD);
+        /*
+         * Keystream length in words (presumably ZUC_WORD == 32 bits, TODO
+         * confirm). L is reduced by one block per loop iteration so that
+         * keyStream[L - 1] below indexes into the current window.
+         */
+        uint32_t L = (N + 31) / ZUC_WORD;
+        uint32_t *pZuc = (uint32_t *) &keyStream[0];
+        uint32_t remainingBits = lengthInBits;
+        /* running accumulator for the result */
+        uint32_t T = 0;
+        const uint8_t *pIn8 = (const uint8_t *) pBufferIn;
+
+#ifdef SAFE_PARAM
+        /* Check for NULL pointers */
+        if (pKey == NULL || pIv == NULL || pBufferIn == NULL || pMacI == NULL)
+                return;
+
+        /* Check input data is in range of supported length */
+        if (lengthInBits < ZUC_MIN_LEN || lengthInBits > ZUC_MAX_LEN)
+                return;
+#endif
+
+        memset(&zucState, 0, sizeof(ZucState_t));
+
+        /* Set up the ZUC state and produce the first keystream block */
+        asm_ZucInitialization(pKey, pIv, &(zucState));
+        asm_ZucGenKeystream64B(pZuc, &zucState);
+
+        /* loop over the message bits, one full keystream block at a time */
+        while (remainingBits >= keyStreamLengthInBits) {
+                remainingBits -=  keyStreamLengthInBits;
+                L -= (keyStreamLengthInBits / 32);
+
+                /*
+                 * Generate the next key stream into the upper half: only
+                 * 8 bytes when no message bits remain after this block,
+                 * otherwise a full 64 bytes
+                 */
+                if (!remainingBits)
+                        asm_ZucGenKeystream8B(&keyStream[16], &zucState);
+                else
+                        asm_ZucGenKeystream64B(&keyStream[16], &zucState);
+                /* Fold the current input block into T (the round routine is
+                 * given the whole keyStream buffer, both halves) */
+                T = asm_Eia3Round64BSSE(T, &keyStream[0], pIn8);
+                /* Slide the window: the upper half becomes the current block */
+                memcpy(&keyStream[0], &keyStream[16], 16 * sizeof(uint32_t));
+                pIn8 = &pIn8[ZUC_KEYSTR_LEN];
+        }
+
+        /*
+         * If remaining bits has more than 14 ZUC WORDS (double words),
+         * keystream needs to have up to another 2 ZUC WORDS (8B)
+         */
+        if (remainingBits > (14 * 32))
+                asm_ZucGenKeystream8B(&keyStream[16], &zucState);
+        /* Fold in the final partial block (fewer than keyStreamLengthInBits
+         * bits remain at this point) */
+        T ^= asm_Eia3RemainderSSE(&keyStream[0], pIn8, remainingBits);
+        /*
+         * XOR in the keystream bits that start at bit offset remainingBits:
+         * two adjacent words are loaded as one 64-bit value and rotated by
+         * the offset within the word; the assignment to the 32-bit T keeps
+         * only the low 32 bits. remainingBits % 32 can be 0 here, in which
+         * case rotate_left() is called with a rotation count of 0.
+         */
+        T ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]),
+                         remainingBits % 32);
+
+        /* save the final MAC-I result: XOR with the last keystream word
+         * (index L - 1 of the current window), then byte-swap */
+        uint32_t keyBlock = keyStream[L - 1];
+        *pMacI = bswap4(T ^ keyBlock);
+
+#ifdef SAFE_DATA
+        /* Clear sensitive data (in registers and stack) */
+        clear_mem(keyStream, sizeof(keyStream));
+        clear_mem(&zucState, sizeof(zucState));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}