;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
; 2-tap sub-pixel filters (SSSE3) for blocks 4, 8 and 16 pixels wide.
;
; Syntax : NASM/YASM, Intel operand order.
; ABI    : abstracted by the macros in x86_abi_support.asm (arg(), sym(),
;          globalsym(), SHADOW_ARGS_TO_STACK, SAVE_XMM, ...).
;
; Every entry point takes the same six arguments:
;   arg(0)  src_ptr          source pixels (bytes)
;   arg(1)  pixels_per_line  source stride, 32-bit signed
;   arg(2)  output_ptr       destination pixels (bytes)
;   arg(3)  out_pitch        destination stride, 32-bit signed
;   arg(4)  output_height    number of rows, 32-bit signed
;   arg(5)  filter           table of 16-bit taps, loaded with movdqa, so it
;                            must be 16-byte aligned; only the taps at word
;                            index 3 and 4 are used
;
; Per output pixel:  dst = sat_u8((a * k3 + b * k4 + 64) >> 7)
; where (a, b) is the pixel and its neighbour one row down (v2 variants) or
; one byte to the right (h2 variants).
;
; Preconditions shared by all entry points:
;   * output_height >= 1. The loops test the row counter only after a row has
;     been written, so a count of 0 would wrap around.
;   * The taps are narrowed to signed bytes with packsswb, so a tap outside
;     [-128, 127] is clamped.
;     NOTE(review): presumably callers never pass such a tap - confirm.
;

%include "aom_ports/x86_abi_support.asm"

; Two copies of the word 0x0100. pmulhrsw computes (x * y + 0x4000) >> 15,
; which for y = 0x0100 equals (x + 64) >> 7: round to nearest, then drop the
; 7 fractional bits of the filter.
%define BILINEAR_ROUND_SHIFT 0x01000100

;-----------------------------------------------------------------------------
; GET_PARAM_4
; Loads the arguments for the 4-wide kernels. Uses only xmm2/xmm3 so that the
; 4-wide entry points do not need SAVE_XMM.
; Out: rsi  = src_ptr            rdi  = output_ptr
;      rax  = pixels_per_line    rdx  = out_pitch
;      rcx  = output_height
;      xmm3 = (k3, k4) signed-byte pair in each of the low four words
;      xmm2 = 0x0100 in every word (rounding multiplier)
;-----------------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         ecx, BILINEAR_ROUND_SHIFT

    movdqa      xmm3, [rdx]                 ;load filters
    psrldq      xmm3, 6                     ;tap 3 becomes word 0
    packsswb    xmm3, xmm3                  ;taps -> signed bytes
    pshuflw     xmm3, xmm3, 0b              ;k3_k4

    movd        xmm2, ecx                   ;rounding_shift
    pshufd      xmm2, xmm2, 0

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg
; In:  xmm0 = first-tap pixels, xmm1 = second-tap pixels (low 4 bytes used)
; %1:  non-zero averages the result with the bytes already at [rdi] (pavgb)
;      before storing; zero stores the result directly.
; Writes 4 bytes to [rdi], advances rsi/rdi by one row and decrements rcx.
; ZF on exit comes from the dec (lea leaves flags alone), so the caller can
; branch with jnz straight after the macro.
; Clobbers: xmm0, xmm1, flags.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                  ;interleave (a, b) byte pairs
    pmaddubsw   xmm0, xmm3                  ;a * k3 + b * k4 per word

    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; GET_PARAM
; Loads the arguments for the 8- and 16-wide kernels.
; Out: rsi  = src_ptr            rdi  = output_ptr
;      rax  = pixels_per_line    rdx  = out_pitch
;      rcx  = output_height
;      xmm7 = (k3, k4) signed-byte pair in all eight words
;      xmm6 = 0x0100 in every word (rounding multiplier)
;-----------------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         ecx, BILINEAR_ROUND_SHIFT

    movdqa      xmm7, [rdx]                 ;load filters
    psrldq      xmm7, 6                     ;tap 3 becomes word 0
    packsswb    xmm7, xmm7                  ;taps -> signed bytes
    pshuflw     xmm7, xmm7, 0b              ;k3_k4
    punpcklwd   xmm7, xmm7                  ;spread the pair to the high half

    movd        xmm6, ecx                   ;rounding_shift
    pshufd      xmm6, xmm6, 0

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg
; In:  xmm0 = first-tap pixels, xmm1 = second-tap pixels (low 8 bytes used)
; %1:  non-zero averages the result with the bytes already at [rdi].
; Writes 8 bytes to [rdi], advances rsi/rdi by one row and decrements rcx,
; leaving ZF from the dec for the caller's jnz.
; Clobbers: xmm0, xmm1, flags.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                  ;interleave (a, b) byte pairs
    pmaddubsw   xmm0, xmm7                  ;a * k3 + b * k4 per word

    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;pack back to byte

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_16 avg
; In:  xmm0 = first-tap pixels, xmm2 = copy of xmm0, xmm1 = second-tap pixels
; %1:  non-zero averages the result with the bytes already at [rdi].
; Filters the low half from xmm0 and the high half from xmm2, writes 16 bytes
; to [rdi], advances rsi/rdi by one row and decrements rcx, leaving ZF from
; the dec for the caller's jnz.
; Clobbers: xmm0, xmm1, xmm2, flags.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm1                  ;pairs for pixels 0..7
    punpckhbw   xmm2, xmm1                  ;pairs for pixels 8..15
    pmaddubsw   xmm0, xmm7
    pmaddubsw   xmm2, xmm7

    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    pmulhrsw    xmm2, xmm6
    packuswb    xmm0, xmm2                  ;pack back to byte

%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

SECTION .text

;-----------------------------------------------------------------------------
; aom_filter_block1d4_v2_ssse3
; Vertical 2-tap filter, 4 pixels wide. Each output row combines the source
; row at rsi with the row at rsi + pixels_per_line, so output_height + 1
; source rows are read.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d4_v2_ssse3)
sym(aom_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;current row
    movd        xmm1, [rsi + rax]           ;next row

    APPLY_FILTER_4 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_v2_ssse3
; Vertical 2-tap filter, 8 pixels wide. Reads output_height + 1 source rows.
; Uses xmm6/xmm7, hence the SAVE_XMM/RESTORE_XMM pair.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d8_v2_ssse3)
sym(aom_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;current row
    movq        xmm1, [rsi + rax]           ;next row

    APPLY_FILTER_8 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_v2_ssse3
; Vertical 2-tap filter, 16 pixels wide. Reads output_height + 1 source rows.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d16_v2_ssse3)
sym(aom_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;current row
    movdqu      xmm1, [rsi + rax]           ;next row
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels

    APPLY_FILTER_16 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d4_h2_ssse3
; Horizontal 2-tap filter, 4 pixels wide. Each output pixel combines a source
; byte with its right-hand neighbour, so 5 source bytes are consumed per row.
; The two operands are fetched with separate 4-byte loads so that nothing
; beyond those 5 bytes is touched.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d4_h2_ssse3)
sym(aom_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4]

    APPLY_FILTER_4 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d8_h2_ssse3
; Horizontal 2-tap filter, 8 pixels wide. 9 source bytes are consumed per
; row; the two operands are fetched with separate 8-byte loads so that
; nothing beyond those 9 bytes is touched.
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d8_h2_ssse3)
sym(aom_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8]

    APPLY_FILTER_8 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; aom_filter_block1d16_h2_ssse3
; Horizontal 2-tap filter, 16 pixels wide. 17 source bytes are read per row
; (two overlapping 16-byte loads at rsi and rsi + 1).
;-----------------------------------------------------------------------------
globalsym(aom_filter_block1d16_h2_ssse3)
sym(aom_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;src[0..15]
    movdqu      xmm1, [rsi + 1]             ;src[1..16]
    movdqa      xmm2, xmm0                  ;copy for the high 8 pixels

    APPLY_FILTER_16 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret