Diffstat (limited to 'media/libvpx/libvpx/vp8/encoder/x86')
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm | 188
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm | 94
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm | 147
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm | 434
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c | 372
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm | 166
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c | 141
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm | 209
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c | 28
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c | 226
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c | 93
11 files changed, 2098 insertions, 0 deletions
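The files below add SSE2/SSE3/SSE4.1/SSSE3 implementations of several VP8 encoder hot loops. As a reading aid for the first file, block_error_sse2.asm, here is a scalar sketch of what it computes: the sum of squared differences between the original and the dequantized transform coefficients of one 4x4 block (16 shorts). The helper name is invented; this is an illustration, not code from the patch.

#include <stdint.h>

/* Illustrative scalar equivalent of vp8_block_error_sse2(): the assembly does
 * the same subtract/square/accumulate with psubw, pmaddwd and a horizontal add. */
static int block_error_scalar(const int16_t *coeff, const int16_t *dqcoeff) {
  int error = 0;
  int i;
  for (i = 0; i < 16; ++i) {
    const int diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }
  return error;
}

vp8_mbblock_error_sse2_impl repeats this across the 16 luma blocks of a macroblock, masking out each block's DC coefficient when its dc argument is nonzero, and vp8_mbuverror_sse2_impl runs the same accumulation over the 128 chroma coefficients.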
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm new file mode 100644 index 0000000000..200b4ccfe6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm @@ -0,0 +1,188 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr) +globalsym(vp8_block_error_sse2) +sym(vp8_block_error_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prologue + + mov rsi, arg(0) ;coeff_ptr + mov rdi, arg(1) ;dcoef_ptr + + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] + + psubw xmm0, xmm1 + psubw xmm2, xmm3 + + pmaddwd xmm0, xmm0 + pmaddwd xmm2, xmm2 + + paddd xmm0, xmm2 + + pxor xmm5, xmm5 + movdqa xmm1, xmm0 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + psrldq xmm0, 8 + paddd xmm0, xmm1 + + movq rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +globalsym(vp8_mbblock_error_sse2_impl) +sym(vp8_mbblock_error_sse2_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 6 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor xmm6, xmm6 + + mov rdi, arg(1) ;dcoef_ptr + pxor xmm4, xmm4 + + movd xmm5, dword ptr arg(2) ;dc + por xmm5, xmm4 + + pcmpeqw xmm5, xmm6 + mov rcx, 16 + +.mberror_loop: + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] + + + psubw xmm2, xmm3 + pmaddwd xmm2, xmm2 + + psubw xmm0, xmm1 + pand xmm0, xmm5 + + pmaddwd xmm0, xmm0 + add rsi, 32 + + add rdi, 32 + + sub rcx, 1 + paddd xmm4, xmm2 + + paddd xmm4, xmm0 + jnz .mberror_loop + + movdqa xmm0, xmm4 + punpckldq xmm0, xmm6 + + punpckhdq xmm4, xmm6 + paddd xmm0, xmm4 + + movdqa xmm1, xmm0 + psrldq xmm0, 8 + + paddd xmm0, xmm1 + movq rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +globalsym(vp8_mbuverror_sse2_impl) +sym(vp8_mbuverror_sse2_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor xmm3, xmm3 + +.mbuverror_loop: + + movdqa xmm1, [rsi] + movdqa xmm2, [rdi] + + psubw xmm1, xmm2 + pmaddwd xmm1, xmm1 + + paddd xmm3, xmm1 + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz .mbuverror_loop + + pxor xmm0, xmm0 + movdqa xmm1, xmm3 + + movdqa xmm2, xmm1 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + paddd xmm1, xmm2 + + movdqa xmm2, xmm1 + + psrldq xmm1, 8 + paddd xmm1, xmm2 + + movq rax, xmm1 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm new file mode 100644 index 0000000000..fe78da398e --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm @@ -0,0 +1,94 @@ +; +; Copyright (c) 2010 The WebM 
project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp8_copy32xn_sse2( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +globalsym(vp8_copy32xn_sse2) +sym(vp8_copy32xn_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;dst_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;dst_stride + movsxd rcx, dword ptr arg(4) ;height + +.block_copy_sse2_loopx4: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + movdqu xmm2, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqu xmm4, XMMWORD PTR [rsi] + movdqu xmm5, XMMWORD PTR [rsi + 16] + movdqu xmm6, XMMWORD PTR [rsi + rax] + movdqu xmm7, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + movdqa XMMWORD PTR [rdi + rdx], xmm2 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 + + lea rdi, [rdi+rdx*2] + + movdqa XMMWORD PTR [rdi], xmm4 + movdqa XMMWORD PTR [rdi + 16], xmm5 + movdqa XMMWORD PTR [rdi + rdx], xmm6 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 + + lea rdi, [rdi+rdx*2] + + sub rcx, 4 + cmp rcx, 4 + jge .block_copy_sse2_loopx4 + + cmp rcx, 0 + je .copy_is_done + +.block_copy_sse2_loop: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + lea rsi, [rsi+rax] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + lea rdi, [rdi+rdx] + + sub rcx, 1 + jne .block_copy_sse2_loop + +.copy_is_done: + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm new file mode 100644 index 0000000000..c40b2d8bf6 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm @@ -0,0 +1,147 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define max_sad arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBVPX_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define max_sad [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define max_sad r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define max_sad + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +SECTION .text + +;void vp8_copy32xn_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +globalsym(vp8_copy32xn_sse3) +sym(vp8_copy32xn_sse3): + + STACK_FRAME_CREATE_X3 + +.block_copy_sse3_loopx4: + lea end_ptr, [src_ptr+src_stride*2] + + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] + movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] + movdqu xmm4, XMMWORD PTR [end_ptr] + movdqu xmm5, XMMWORD PTR [end_ptr + 16] + movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] + movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] + + lea src_ptr, [src_ptr+src_stride*4] + + lea end_ptr, [ref_ptr+ref_stride*2] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 + movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 + movdqa XMMWORD PTR [end_ptr], xmm4 + movdqa XMMWORD PTR [end_ptr + 16], xmm5 + movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 + movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 + + lea ref_ptr, [ref_ptr+ref_stride*4] + + sub height, 4 + cmp height, 4 + jge .block_copy_sse3_loopx4 + + ;Check to see if there is more rows need to be copied. + cmp height, 0 + je .copy_is_done + +.block_copy_sse3_loop: + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + lea src_ptr, [src_ptr+src_stride] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + lea ref_ptr, [ref_ptr+ref_stride] + + sub height, 1 + jne .block_copy_sse3_loop + +.copy_is_done: + STACK_FRAME_DESTROY_X3 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm new file mode 100644 index 0000000000..3c28cb902e --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -0,0 +1,434 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE 0 +%if ABI_IS_32BIT + %define input rsi + %define output rdi + %define pitch rax + push rbp + mov rbp, rsp + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + + movsxd rax, dword ptr arg(2) + lea rcx, [rsi + rax*2] +%else + %if LIBVPX_YASM_WIN64 + %define input rcx + %define output rdx + %define pitch r8 + SAVE_XMM 7, u + %else + %define input rdi + %define output rsi + %define pitch rdx + %endif +%endif +%endmacro + +%macro STACK_FRAME_DESTROY 0 + %define input + %define output + %define pitch + +%if ABI_IS_32BIT + pop rdi + pop rsi + RESTORE_GOT + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +SECTION .text + +;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) +globalsym(vp8_short_fdct4x4_sse2) +sym(vp8_short_fdct4x4_sse2): + + STACK_FRAME_CREATE + + movq xmm0, MMWORD PTR[input ] ;03 02 01 00 + movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10 + lea input, [input+2*pitch] + movq xmm1, MMWORD PTR[input ] ;23 22 21 20 + movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30 + + punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00 + punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00 + punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10 + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00 + pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx + pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx + + punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03 + movdqa xmm3, xmm0 + paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1 + psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1 + psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3 + psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3 + + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352 + + paddd xmm3, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_7500)] + psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm0, xmm1 ;op[2] op[0] + packssdw xmm3, xmm4 ;op[3] op[1] + ; 23 22 21 20 03 02 01 00 + ; + ; 33 32 31 30 13 12 11 10 + ; + movdqa xmm2, xmm0 + punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30 + + movdqa xmm3, xmm0 + punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00 + punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01 + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20 + + movdqa xmm5, XMMWORD PTR[GLOBAL(_7)] + pshufd xmm2, xmm2, 04eh + movdqa xmm3, xmm0 + paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1 + psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1 + + pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1 + movdqa xmm2, xmm3 ;save d1 for compare + pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1 + pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1 + pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1 + pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1 + 
pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1 + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + + pxor xmm4, xmm4 ;zero out for compare + paddd xmm0, xmm5 + paddd xmm1, xmm5 + pcmpeqw xmm2, xmm4 + psrad xmm0, 4 ;(a1 + b1 + 7)>>4 + psrad xmm1, 4 ;(a1 - b1 + 7)>>4 + pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper, + ;and keep bit 0 of lower + + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352 + paddd xmm3, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_51000)] + packssdw xmm0, xmm1 ;op[8] op[0] + psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16 + + packssdw xmm3, xmm4 ;op[12] op[4] + movdqa xmm1, xmm0 + paddw xmm3, xmm2 ;op[4] += (d1!=0) + punpcklqdq xmm0, xmm3 ;op[4] op[0] + punpckhqdq xmm1, xmm3 ;op[12] op[8] + + movdqa XMMWORD PTR[output + 0], xmm0 + movdqa XMMWORD PTR[output + 16], xmm1 + + STACK_FRAME_DESTROY + +;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) +globalsym(vp8_short_fdct8x4_sse2) +sym(vp8_short_fdct8x4_sse2): + + STACK_FRAME_CREATE + + ; read the input data + movdqa xmm0, [input ] + movdqa xmm2, [input+ pitch] + lea input, [input+2*pitch] + movdqa xmm4, [input ] + movdqa xmm3, [input+ pitch] + + ; transpose for the first stage + movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 + movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 + + punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 + punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + + punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 + punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + + movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 + punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + + punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + + movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 + punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + + punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 + movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + + punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 + punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + + movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 + punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + + punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 + + ; xmm0 0 + ; xmm1 1 + ; xmm2 2 + ; xmm3 3 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm2 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + psllw xmm5, 3 + psllw xmm4, 3 + + psllw xmm0, 3 + psllw xmm1, 3 + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; op[0] = a1 + b1 + psubw xmm2, xmm1 ; op[2] = a1 - b1 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm3, XMMWORD PTR[GLOBAL(_7500)] + paddd xmm5, XMMWORD PTR[GLOBAL(_7500)] + + psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 
7500)>>12 + psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm1, xmm4 ; op[1] + packssdw xmm3, xmm5 ; op[3] + + ; done with vertical + ; transpose for the second stage + movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34 + movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36 + + punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31 + punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35 + + punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33 + punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + + movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31 + punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13 + + punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33 + + movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35 + punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17 + + punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37 + movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33 + + punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37 + punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27 + + movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13 + punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07 + + punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17 + + ; xmm0 0 + ; xmm1 4 + ; xmm2 1 + ; xmm3 3 + + movdqa xmm5, xmm0 + movdqa xmm2, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm4 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + pxor xmm6, xmm6 ; zero out for compare + + pcmpeqw xmm6, xmm5 ; d1 != 0 + + pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper, + ; and keep bit 0 of lower + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; a1 + b1 + psubw xmm2, xmm1 ; a1 - b1 + + paddw xmm0, XMMWORD PTR[GLOBAL(_7w)] + paddw xmm2, XMMWORD PTR[GLOBAL(_7w)] + + psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4 + psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm3, XMMWORD PTR[GLOBAL(_51000)] + paddd xmm5, XMMWORD PTR[GLOBAL(_51000)] + + psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 + psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 + + packssdw xmm1, xmm4 ; op[4] + packssdw xmm3, xmm5 ; op[12] + + paddw xmm1, xmm6 ; op[4] += (d1!=0) + + movdqa xmm4, xmm0 + movdqa xmm5, xmm2 + + punpcklqdq xmm0, xmm1 + punpckhqdq xmm4, xmm1 + + punpcklqdq xmm2, xmm3 + punpckhqdq xmm5, xmm3 + + movdqa XMMWORD PTR[output + 0 ], xmm0 + movdqa XMMWORD PTR[output + 16], xmm2 + movdqa XMMWORD PTR[output + 32], xmm4 + movdqa XMMWORD PTR[output + 48], xmm5 + + STACK_FRAME_DESTROY + +SECTION_RODATA +align 16 +_5352_2217: + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 +align 16 +_2217_neg5352: + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 +align 16 +_mult_add: + times 8 dw 1 +align 16 +_cmp_mask: + times 4 dw 1 + times 4 dw 0 +align 16 +_cmp_mask8x4: + times 8 dw 1 +align 16 +_mult_sub: + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 +align 16 +_7: + times 4 dd 7 +align 16 +_7w: + times 8 dw 7 +align 16 +_14500: + times 4 dd 14500 
+align 16 +_7500: + times 4 dd 7500 +align 16 +_12000: + times 4 dd 12000 +align 16 +_51000: + times 4 dd 51000 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c new file mode 100644 index 0000000000..f35b930169 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/encoder/denoising.h" +#include "vp8/common/reconinter.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8_rtcd.h" + +#include <emmintrin.h> +#include "vpx_ports/emmintrin_compat.h" + +/* Compute the sum of all pixel differences of this MB. */ +static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = + _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = + _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba)); + + return sum_diff; +} + +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, + int mc_avg_y_stride, unsigned char *running_avg_y, + int avg_y_stride, unsigned char *sig, + int sig_stride, unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_y_start = running_avg_y; + unsigned char *sig_start = sig; + unsigned int sum_diff_thresh; + int r; + int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + for (r = 0; r < 16; ++r) { + /* Calculate differences */ + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); + __m128i v_running_avg_y; + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. 
*/ + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. */ + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + /* Adjustments <=7, and each element in acc_diff can fit in signed + * char. + */ + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + + { + /* Compute the sum of all pixel differences of this MB. */ + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); + sum_diff_thresh = SUM_DIFF_THRESHOLD; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + if (abs_sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_avg_y_stride * 16; + running_avg_y -= avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + __m128i v_running_avg_y = + _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. + const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. 
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + + // Update pointers for next iteration. + sig += sig_stride; + mc_running_avg_y += mc_avg_y_stride; + running_avg_y += avg_y_stride; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} + +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, + int mc_avg_stride, unsigned char *running_avg, + int avg_stride, unsigned char *sig, + int sig_stride, unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_start = running_avg; + unsigned char *sig_start = sig; + unsigned int sum_diff_thresh; + int r; + int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) + ? 1 + : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + { + const __m128i k_1 = _mm_set1_epi16(1); + __m128i vec_sum_block = _mm_setzero_si128(); + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0])); + const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0); + vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1); + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + const int sum_block = _mm_cvtsi128_si32(hgfedcba); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + for (r = 0; r < 4; ++r) { + /* Calculate differences */ + const __m128i v_sig_low = + _mm_castpd_si128(_mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd( + _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = + _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. 
*/ + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + __m128i v_running_avg; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. */ + v_running_avg = _mm_adds_epu8(v_sig, padj); + v_running_avg = _mm_subs_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + /* Adjustments <=7, and each element in acc_diff can fit in signed + * char. + */ + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + + { + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); + sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (abs_sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; + for (r = 0; r < 4; ++r) { + // Calculate differences. + const __m128i v_sig_low = + _mm_castpd_si128(_mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd( + _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = + _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. 
+ __m128i padj, nadj; + const __m128i v_running_avg_low = + _mm_castpd_si128(_mm_load_sd((double *)(&running_avg[0]))); + __m128i v_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low), + (double *)(&running_avg[avg_stride]))); + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg = _mm_subs_epu8(v_running_avg, padj); + v_running_avg = _mm_adds_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], + _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + + // Update pointers for next iteration. + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); + return FILTER_BLOCK; +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm new file mode 100644 index 0000000000..938fc173ff --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -0,0 +1,166 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
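Before continuing with fwalsh_sse2.asm, a note on the mask arithmetic in denoising_sse2.c above: per pixel, the filter nudges the signal toward the motion-compensated running average by an amount that shrinks as the absolute difference grows. A scalar reading of the SSE2 masks (helper and parameter names invented for illustration):

#include <stdlib.h>

/* Scalar reading of the per-pixel adjustment in vp8_denoiser_filter_sse2().
 * l3 is 7 + shift_inc when motion_magnitude is at or below the threshold,
 * otherwise 6; shift_inc is 1 only when increase_denoising is set for low motion. */
static unsigned char denoise_pixel(unsigned char sig, unsigned char mc_avg,
                                   int l3, int shift_inc) {
  const int diff = (int)mc_avg - (int)sig;
  const int absdiff = abs(diff);
  int adj, v;
  if (absdiff <= 3 + shift_inc) {
    adj = absdiff;        /* small differences are applied in full */
  } else if (absdiff <= 7) {
    adj = l3 - 3;         /* level 1 */
  } else if (absdiff <= 15) {
    adj = l3 - 2;         /* level 2 */
  } else {
    adj = l3;             /* level 3 */
  }
  v = diff > 0 ? sig + adj : sig - adj;   /* _mm_adds_epu8 / _mm_subs_epu8 */
  if (v < 0) v = 0;
  if (v > 255) v = 255;
  return (unsigned char)v;
}

The signed adjustments are also accumulated per macroblock; if their absolute sum exceeds SUM_DIFF_THRESHOLD (or the _HIGH variant), the code retries with a small capped delta (at most 3) and otherwise returns COPY_BLOCK.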
+; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) +globalsym(vp8_short_walsh4x4_sse2) +sym(vp8_short_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ; input + mov rdi, arg(1) ; output + movsxd rdx, dword ptr arg(2) ; pitch + + ; first for loop + movq xmm0, MMWORD PTR [rsi] ; load input + movq xmm1, MMWORD PTR [rsi + rdx] + lea rsi, [rsi + rdx*2] + movq xmm2, MMWORD PTR [rsi] + movq xmm3, MMWORD PTR [rsi + rdx] + + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ; ip[1] ip[0] + punpckhdq xmm1, xmm2 ; ip[3] ip[2] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + + psllw xmm0, 2 ; d1 a1 + psllw xmm2, 2 ; c1 b1 + + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 ; b1 a1 + punpckhqdq xmm1, xmm2 ; c1 d1 + + pxor xmm6, xmm6 + movq xmm6, xmm0 + pxor xmm7, xmm7 + pcmpeqw xmm7, xmm6 + paddw xmm7, [GLOBAL(c1)] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 ; b1+c1 a1+d1 + psubw xmm2, xmm1 ; b1-c1 a1-d1 + paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0) + + ; second for loop + ; input: 13 9 5 1 12 8 4 0 (xmm0) + ; 14 10 6 2 15 11 7 3 (xmm2) + ; after shuffle: + ; 13 5 9 1 12 4 8 0 (xmm0) + ; 14 6 10 2 15 7 11 3 (xmm1) + pshuflw xmm3, xmm0, 0xd8 + pshufhw xmm0, xmm3, 0xd8 + pshuflw xmm3, xmm2, 0xd8 + pshufhw xmm1, xmm3, 0xd8 + + movdqa xmm2, xmm0 + pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10 + pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10 + movdqa xmm3, xmm1 + pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13 + pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13 + + pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10 + pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10 + pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12 + pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12 + + movdqa xmm0, xmm4 + punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10 + punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10 + movdqa xmm1, xmm6 + punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12 + punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12 + + movdqa xmm2, xmm0 + paddd xmm0, xmm4 ; b21 b20 a21 a20 + psubd xmm2, xmm4 ; c21 c20 d21 d20 + movdqa xmm3, xmm1 + paddd xmm1, xmm6 ; b23 b22 a23 a22 + psubd xmm3, xmm6 ; c23 c22 d23 d22 + + pxor xmm4, xmm4 + movdqa xmm5, xmm4 + pcmpgtd xmm4, xmm0 + pcmpgtd xmm5, xmm2 + pand xmm4, [GLOBAL(cd1)] + pand xmm5, [GLOBAL(cd1)] + + pxor xmm6, xmm6 + movdqa xmm7, xmm6 + pcmpgtd xmm6, xmm1 + pcmpgtd xmm7, xmm3 + pand xmm6, [GLOBAL(cd1)] + pand xmm7, [GLOBAL(cd1)] + + paddd xmm0, xmm4 + paddd xmm2, xmm5 + paddd xmm0, [GLOBAL(cd3)] + paddd xmm2, [GLOBAL(cd3)] + paddd xmm1, xmm6 + paddd xmm3, xmm7 + paddd xmm1, [GLOBAL(cd3)] + paddd xmm3, [GLOBAL(cd3)] + + psrad xmm0, 3 + psrad xmm1, 3 + psrad xmm2, 3 + psrad xmm3, 3 + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20 + punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20 + movdqa xmm5, xmm2 + punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20 + punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20 + + packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20 + packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20 + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm2 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +c1: + dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 +align 16 +cn1: + dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff +align 16 +cd1: + dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 +align 16 
+cd3: + dd 0x00000003, 0x00000003, 0x00000003, 0x00000003 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c new file mode 100644 index 0000000000..4c2d24cc27 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> /* SSE4.1 */ + +#include "./vp8_rtcd.h" +#include "vp8/encoder/block.h" +#include "vpx_ports/bitops.h" /* get_lsb */ +#include "vpx_ports/compiler_attributes.h" + +// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask. +VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b, + BLOCKD *d) { + int eob = -1; + short *zbin_boost_ptr = b->zrun_zbin_boost; + __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); + __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8)); + __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; + __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); + __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); + __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); + __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); + __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1; + uint32_t mask, ymask; + DECLARE_ALIGNED(16, static const uint8_t, + zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 }; + + /* Duplicate to all lanes. */ + zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); + zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); + + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); + + /* zbin[] + zbin_extra */ + zbin0 = _mm_add_epi16(zbin0, zbin_extra); + zbin1 = _mm_add_epi16(zbin1, zbin_extra); + + /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance + * the equation because boost is the only value which can change: + * x - (zbin[] + extra) >= boost */ + x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); + x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); + + /* All the remaining calculations are valid whether they are done now with + * simd or later inside the loop one at a time. */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + y0 = _mm_mulhi_epi16(x0, quant0); + y1 = _mm_mulhi_epi16(x1, quant1); + + y0 = _mm_add_epi16(y0, x0); + y1 = _mm_add_epi16(y1, x1); + + /* Instead of shifting each value independently we convert the scaling + * factor with 1 << (16 - shift) so we can use multiply/return high half. 
*/ + y0 = _mm_mulhi_epi16(y0, quant_shift0); + y1 = _mm_mulhi_epi16(y1, quant_shift1); + + /* Restore the sign. */ + y0 = _mm_sign_epi16(y0, z0); + y1 = _mm_sign_epi16(y1, z1); + + { + const __m128i zig_zag_i16_0 = + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13); + const __m128i zig_zag_i16_1 = + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13); + + /* The first part of the zig zag needs a value + * from x_minus_zbin1 and vice versa. */ + t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2); + t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80); + t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80); + x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0); + x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1); + } + + /* Check if y is nonzero and put it in zig zag order. */ + t0 = _mm_packs_epi16(y0, y1); + t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128()); + t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask)); + ymask = _mm_movemask_epi8(t0) ^ 0xffff; + + for (;;) { + t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0); + t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1); + t0 = _mm_packs_epi16(t0, t1); + mask = _mm_movemask_epi8(t0); + mask = ~mask & ymask; + if (!mask) break; + /* |eob| will contain the index of the next found element where: + * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */ + eob = get_lsb(mask); + /* Need to clear the mask from processed elements so that + * they are no longer counted in the next iteration. */ + ymask &= ~1U << eob; + /* It's safe to read ahead of this buffer if struct VP8_COMP has at + * least 32 bytes before the zrun_zbin_boost_* fields (it has 384). + * Any data read outside of the buffer is masked by the updated |ymask|. */ + zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1)); + zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7)); + qcoeff[zig_zag_mask[eob]] = 0xffff; + } + + qcoeff0 = _mm_load_si128((__m128i *)(qcoeff)); + qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8)); + qcoeff0 = _mm_and_si128(qcoeff0, y0); + qcoeff1 = _mm_and_si128(qcoeff1, y1); + + _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); + + dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0); + dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); + + *d->eob = eob + 1; +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm new file mode 100644 index 0000000000..67102064a1 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -0,0 +1,209 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
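Before the temporal filter, a note on quantize_sse4.c above. Per coefficient, visited in zig-zag order, the scalar logic it vectorizes is roughly the following; the helper and its parameter names are invented, and quant_shift is assumed to already hold 1 << (16 - shift) as the file's comment states:

#include <stdlib.h>
#include <stdint.h>

/* Illustrative per-coefficient step mirroring the intrinsics above.
 * zbin_plus_extra = zbin[rc] + zbin_extra; boost comes from zrun_zbin_boost
 * and resets to the start of that table each time a nonzero value is emitted. */
static int16_t quantize_coeff(int16_t z, int zbin_plus_extra, int boost,
                              int16_t round, int16_t quant, int16_t quant_shift) {
  const int x = abs(z);
  int y;
  if (x - zbin_plus_extra < boost) return 0;  /* inside the boosted dead zone */
  y = x + round;
  y = ((y * quant) >> 16) + y;                /* _mm_mulhi_epi16 + _mm_add_epi16 */
  y = (y * quant_shift) >> 16;                /* quant_shift holds 1 << (16 - shift) */
  return (int16_t)(z < 0 ? -y : y);           /* _mm_sign_epi16 */
}

The loop in the file performs the dead-zone test for all 16 positions at once: it builds a nonzero mask of the quantized values in zig-zag order, compares the running boosts against x - zbin with pcmpgt/pmovmskb, and uses get_lsb to pick the next surviving index, updating eob and reloading the boost vectors as it goes.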
+; + + +%include "vpx_ports/x86_abi_support.asm" + +SECTION .text + +; void vp8_temporal_filter_apply_sse2 | arg +; (unsigned char *frame1, | 0 +; unsigned int stride, | 1 +; unsigned char *frame2, | 2 +; unsigned int block_size, | 3 +; int strength, | 4 +; int filter_weight, | 5 +; unsigned int *accumulator, | 6 +; unsigned short *count) | 7 +globalsym(vp8_temporal_filter_apply_sse2) +sym(vp8_temporal_filter_apply_sse2): + + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ALIGN_STACK 16, rax + %define block_size 0 + %define strength 16 + %define filter_weight 32 + %define rounding_bit 48 + %define rbp_backup 64 + %define stack_size 80 + sub rsp, stack_size + mov [rsp + rbp_backup], rbp + ; end prolog + + mov rdx, arg(3) + mov [rsp + block_size], rdx + movd xmm6, arg(4) + movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read + + ; calculate the rounding bit outside the loop + ; 0x8000 >> (16 - strength) + mov rdx, 16 + sub rdx, arg(4) ; 16 - strength + movq xmm4, rdx ; can't use rdx w/ shift + movdqa xmm5, [GLOBAL(_const_top_bit)] + psrlw xmm5, xmm4 + movdqa [rsp + rounding_bit], xmm5 + + mov rsi, arg(0) ; src/frame1 + mov rdx, arg(2) ; predictor frame + mov rdi, arg(6) ; accumulator + mov rax, arg(7) ; count + + ; dup the filter weight and store for later + movd xmm0, arg(5) ; filter_weight + pshuflw xmm0, xmm0, 0 + punpcklwd xmm0, xmm0 + movdqa [rsp + filter_weight], xmm0 + + mov rbp, arg(1) ; stride + pxor xmm7, xmm7 ; zero for extraction + + lea rcx, [rdx + 16*16*1] + cmp dword ptr [rsp + block_size], 8 + jne .temporal_filter_apply_load_16 + lea rcx, [rdx + 8*8*1] + +.temporal_filter_apply_load_8: + movq xmm0, [rsi] ; first row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm0, xmm7 ; src[ 0- 7] + movq xmm1, [rsi] ; second row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm1, xmm7 ; src[ 8-15] + jmp .temporal_filter_apply_load_finished + +.temporal_filter_apply_load_16: + movdqa xmm0, [rsi] ; src (frame1) + lea rsi, [rsi + rbp] ; += stride + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; src[ 0- 7] + punpckhbw xmm1, xmm7 ; src[ 8-15] + +.temporal_filter_apply_load_finished: + movdqa xmm2, [rdx] ; predictor (frame2) + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm7 ; pred[ 0- 7] + punpckhbw xmm3, xmm7 ; pred[ 8-15] + + ; modifier = src_byte - pixel_value + psubw xmm0, xmm2 ; src - pred[ 0- 7] + psubw xmm1, xmm3 ; src - pred[ 8-15] + + ; modifier *= modifier + pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 + pmullw xmm1, xmm1 ; modifer[ 8-15]^2 + + ; modifier *= 3 + pmullw xmm0, [GLOBAL(_const_3w)] + pmullw xmm1, [GLOBAL(_const_3w)] + + ; modifer += 0x8000 >> (16 - strength) + paddw xmm0, [rsp + rounding_bit] + paddw xmm1, [rsp + rounding_bit] + + ; modifier >>= strength + psrlw xmm0, [rsp + strength] + psrlw xmm1, [rsp + strength] + + ; modifier = 16 - modifier + ; saturation takes care of modifier > 16 + movdqa xmm3, [GLOBAL(_const_16w)] + movdqa xmm2, [GLOBAL(_const_16w)] + psubusw xmm3, xmm1 + psubusw xmm2, xmm0 + + ; modifier *= filter_weight + pmullw xmm2, [rsp + filter_weight] + pmullw xmm3, [rsp + filter_weight] + + ; count + movdqa xmm4, [rax] + movdqa xmm5, [rax+16] + ; += modifier + paddw xmm4, xmm2 + paddw xmm5, xmm3 + ; write back + movdqa [rax], xmm4 + movdqa [rax+16], xmm5 + lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) + + ; load and extract the predictor up to shorts + pxor xmm7, xmm7 + movdqa xmm0, [rdx] + lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; pred[ 
0- 7] + punpckhbw xmm1, xmm7 ; pred[ 8-15] + + ; modifier *= pixel_value + pmullw xmm0, xmm2 + pmullw xmm1, xmm3 + + ; expand to double words + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm7 ; [ 0- 3] + punpckhwd xmm2, xmm7 ; [ 4- 7] + movdqa xmm3, xmm1 + punpcklwd xmm1, xmm7 ; [ 8-11] + punpckhwd xmm3, xmm7 ; [12-15] + + ; accumulator + movdqa xmm4, [rdi] + movdqa xmm5, [rdi+16] + movdqa xmm6, [rdi+32] + movdqa xmm7, [rdi+48] + ; += modifier + paddd xmm4, xmm0 + paddd xmm5, xmm2 + paddd xmm6, xmm1 + paddd xmm7, xmm3 + ; write back + movdqa [rdi], xmm4 + movdqa [rdi+16], xmm5 + movdqa [rdi+32], xmm6 + movdqa [rdi+48], xmm7 + lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) + + cmp rdx, rcx + je .temporal_filter_apply_epilog + pxor xmm7, xmm7 ; zero for extraction + cmp dword ptr [rsp + block_size], 16 + je .temporal_filter_apply_load_16 + jmp .temporal_filter_apply_load_8 + +.temporal_filter_apply_epilog: + ; begin epilog + mov rbp, [rsp + rbp_backup] + add rsp, stack_size + pop rsp + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +_const_3w: + times 8 dw 3 +align 16 +_const_top_bit: + times 8 dw 1<<15 +align 16 +_const_16w: + times 8 dw 16 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c new file mode 100644 index 0000000000..d0752453ee --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vp8/encoder/block.h" + +int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_sse2(MACROBLOCK *mb, int dc) { + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_sse2_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_sse2(MACROBLOCK *mb) { + short *s_ptr = &mb->coeff[256]; + short *d_ptr = &mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_sse2_impl(s_ptr, d_ptr); +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c new file mode 100644 index 0000000000..581d2565ee --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
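And before the remaining quantizer sources, a note on temporal_filter_apply_sse2.asm above: per pixel it turns the squared source/predictor difference into a weight in the 0..16 range, scales it by filter_weight, and accumulates both the weight and weight * predictor. A scalar sketch (names invented for illustration):

#include <stdint.h>

/* Scalar sketch of one pixel of vp8_temporal_filter_apply_sse2().
 * frame1 is the source pixel, frame2 the predictor pixel; accumulator and
 * count are the per-position running totals updated by the filter. */
static void temporal_filter_pixel(int frame1, int frame2, int strength,
                                  int filter_weight, unsigned int *accumulator,
                                  unsigned short *count) {
  int modifier = frame1 - frame2;
  modifier *= modifier;                    /* squared difference */
  modifier *= 3;
  modifier += 0x8000 >> (16 - strength);   /* rounding bit from the prologue */
  modifier >>= strength;
  modifier = 16 - modifier;                /* psubusw: saturates at 0 */
  if (modifier < 0) modifier = 0;
  modifier *= filter_weight;
  *count += (unsigned short)modifier;
  *accumulator += (unsigned int)(modifier * frame2);
}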
+ */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/x86.h" +#include "vpx_mem/vpx_mem.h" +#include "vp8/encoder/block.h" +#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ + +#include <mmintrin.h> /* MMX */ +#include <xmmintrin.h> /* SSE */ +#include <emmintrin.h> /* SSE2 */ + +#define SELECT_EOB(i, z) \ + do { \ + short boost = *zbin_boost_ptr; \ + int cmp = (x[z] < boost) | (y[z] == 0); \ + zbin_boost_ptr++; \ + if (cmp) break; \ + qcoeff_ptr[z] = y[z]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } while (0) + +void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) { + char eob = 0; + short *zbin_boost_ptr; + short *qcoeff_ptr = d->qcoeff; + DECLARE_ALIGNED(16, short, x[16]); + DECLARE_ALIGNED(16, short, y[16]); + + __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; + __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); + __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); + __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); + __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); + __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + memset(qcoeff_ptr, 0, 32); + + /* Duplicate to all lanes. */ + zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); + zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); + + /* Sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* zbin[] + zbin_extra */ + zbin0 = _mm_add_epi16(zbin0, zbin_extra); + zbin1 = _mm_add_epi16(zbin1, zbin_extra); + + /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance + * the equation because boost is the only value which can change: + * x - (zbin[] + extra) >= boost */ + x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); + x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); + + _mm_store_si128((__m128i *)(x), x_minus_zbin0); + _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); + + /* All the remaining calculations are valid whether they are done now with + * simd or later inside the loop one at a time. */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + y0 = _mm_mulhi_epi16(x0, quant0); + y1 = _mm_mulhi_epi16(x1, quant1); + + y0 = _mm_add_epi16(y0, x0); + y1 = _mm_add_epi16(y1, x1); + + /* Instead of shifting each value independently we convert the scaling + * factor with 1 << (16 - shift) so we can use multiply/return high half. */ + y0 = _mm_mulhi_epi16(y0, quant_shift0); + y1 = _mm_mulhi_epi16(y1, quant_shift1); + + /* Return the sign: (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + y0 = _mm_sub_epi16(y0, sz0); + y1 = _mm_sub_epi16(y1, sz1); + + _mm_store_si128((__m128i *)(y), y0); + _mm_store_si128((__m128i *)(y + 8), y1); + + zbin_boost_ptr = b->zrun_zbin_boost; + + /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. 
*/ + SELECT_EOB(1, 0); + SELECT_EOB(2, 1); + SELECT_EOB(3, 4); + SELECT_EOB(4, 8); + SELECT_EOB(5, 5); + SELECT_EOB(6, 2); + SELECT_EOB(7, 3); + SELECT_EOB(8, 6); + SELECT_EOB(9, 9); + SELECT_EOB(10, 12); + SELECT_EOB(11, 13); + SELECT_EOB(12, 10); + SELECT_EOB(13, 7); + SELECT_EOB(14, 11); + SELECT_EOB(15, 14); + SELECT_EOB(16, 15); + + y0 = _mm_load_si128((__m128i *)(d->qcoeff)); + y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); + + /* dqcoeff = qcoeff * dequant */ + y0 = _mm_mullo_epi16(y0, dequant0); + y1 = _mm_mullo_epi16(y1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), y0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); + + *d->eob = eob; +} + +void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) { + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i inv_zig_zag0 = + _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); + __m128i inv_zig_zag1 = + _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); + + __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* x = abs(y) = (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + xdq0 = _mm_mullo_epi16(x0, dequant0); + xdq1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); + + /* build a mask for the zig zag */ + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpeq_epi16(x0, zeros); + x1 = _mm_cmpeq_epi16(x1, zeros); + + ones = _mm_cmpeq_epi16(zeros, zeros); + + x0 = _mm_xor_si128(x0, ones); + x1 = _mm_xor_si128(x1, ones); + + x0 = _mm_and_si128(x0, inv_zig_zag0); + x1 = _mm_and_si128(x1, inv_zig_zag1); + + x0 = _mm_max_epi16(x0, x1); + + /* now down to 8 */ + x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* only 4 left */ + x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* okay, just 2! */ + x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 + + x0 = _mm_max_epi16(x0, x1); + + *d->eob = 0xFF & _mm_cvtsi128_si32(x0); +} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c new file mode 100644 index 0000000000..f6df146f08 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <tmmintrin.h> /* SSSE3 */ + +#include "./vp8_rtcd.h" +#include "vp8/encoder/block.h" +#include "vpx_ports/bitops.h" /* get_msb */ + +void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { + int eob, mask; + + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1; + + DECLARE_ALIGNED(16, const uint8_t, + pshufb_zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask); + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* ASM saves Y for EOB */ + /* I think we can ignore that because adding the sign doesn't change anything + * and multiplying 0 by dequant is OK as well */ + abs0 = y0; + abs1 = y1; + + /* Restore the sign bit. */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + x0 = _mm_mullo_epi16(x0, dequant0); + x1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), x0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1); + + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpgt_epi16(abs0, zeros); + x1 = _mm_cmpgt_epi16(abs1, zeros); + + x = _mm_packs_epi16(x0, x1); + + x = _mm_shuffle_epi8(x, zig_zag); + + mask = _mm_movemask_epi8(x); + + /* x2 is needed to increase the result from non-zero masks by 1, + * +1 is needed to mask undefined behavior for a null argument, + * the result of get_msb(1) is 0 */ + eob = get_msb(mask * 2 + 1); + + *d->eob = eob; +} |
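Finally, the eob computation at the end of vp8_quantize_ssse3.c is the least obvious step: a byte mask of nonzero quantized values is shuffled into zig-zag order with pshufb, _mm_movemask_epi8 collapses it to 16 bits, and get_msb(mask * 2 + 1) yields the count up to and including the last nonzero coefficient (the * 2 + 1 keeps the argument nonzero when the mask is empty, so get_msb(1) gives 0). A scalar equivalent, with the same zig-zag table written out (helper name invented):

#include <stdint.h>

/* Scalar equivalent of the SSSE3 eob computation: eob is the number of
 * coefficients up to and including the last nonzero one in zig-zag scan order. */
static int fast_quantize_eob(const int16_t qcoeff[16]) {
  static const int zig_zag[16] = { 0, 1, 4, 8, 5, 2, 3, 6,
                                   9, 12, 13, 10, 7, 11, 14, 15 };
  int i, eob = 0;
  for (i = 0; i < 16; ++i) {
    if (qcoeff[zig_zag[i]] != 0) eob = i + 1;
  }
  return eob;
}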