diff options
Diffstat (limited to 'media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm')
-rw-r--r-- | media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm | 270 |
1 files changed, 270 insertions, 0 deletions
; File: media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; Syntax: Intel operand order (op dst, src), NASM-style preprocessor.
; The ABI plumbing (arg(), SHADOW_ARGS_TO_STACK, GET_GOT, GLOBAL, sym, ...)
; comes from the include below and is not defined in this file.
%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7             ; results are shifted right by 7 (/128)

SECTION .text

;-----------------------------------------------------------------------
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter producing four outputs per row.
; With p[k] = src_ptr[k] for the current row, each x in 0..3 yields
;     (sum(k=0..5) p[x-2+k] * tap[k] + 64) >> 7
; clamped to 0..255 and stored as a 16-bit word. Products keep their low
; 16 bits (pmullw) and every addition saturates at 16 bits (paddsw).
;
; Per row this reads 8 bytes at src_ptr-2 and 4 bytes at src_ptr+3, and
; writes 8 bytes at output_ptr.
;
; Notes on the arguments, as used by the code below:
;   * vp8_filter: tap k is read as 4 words at byte offset 16*k.
;   * output_width is added unscaled to the output pointer after each row,
;     so it acts as the output stride in bytes.
;   * src_pixels_per_line is added to the source pointer after each row.
;   * pixel_step is not read.
;   * output_height must be non-zero: the row counter is decremented and
;     tested only after a row has been processed.
;
; Register roles inside the loop:
;   rsi = source row     rdi = output row     rcx = rows remaining
;   rax = output stride  rdx = vp8_filter
;   mm0 = 0   mm1 = tap1   mm2 = tap2   mm6 = tap3   mm7 = tap4
;   mm3 = accumulator      mm4, mm5 = scratch
;-----------------------------------------------------------------------
globalsym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,    arg(6)              ; rdx = vp8_filter

    ; Taps 1..4 live in registers; taps 0 and 5 are read from memory in the
    ; loop because all eight MMX registers are in use.
    ; Original note: "do both the negative taps first!!!" -- taps 1 and 4
    ; are the non-positive entries in vp8_six_tap_x86 and are the first two
    ; products accumulated below. Presumably this keeps the saturating
    ; partial sums in range; keep the accumulation order as is.
    movq        mm1,    [rdx + 16]          ; mm1 = tap 1
    movq        mm2,    [rdx + 32]          ; mm2 = tap 2
    movq        mm6,    [rdx + 48]          ; mm6 = tap 3
    movq        mm7,    [rdx + 64]          ; mm7 = tap 4

    mov         rdi,    arg(1)              ; rdi = output_ptr
    mov         rsi,    arg(0)              ; rsi = src_ptr
    movsxd      rcx,    dword ptr arg(4)    ; rcx = output_height (row counter)
    movsxd      rax,    dword ptr arg(5)    ; rax = output_width (output stride, bytes)
    pxor        mm0,    mm0                 ; mm0 = 0, used to widen bytes to words

.nextrow:
    ; tap 1 * p-1..p2
    movq        mm3,    [rsi-2]             ; mm3 = bytes p-2..p5
    movq        mm4,    mm3                 ; mm4 = p-2..p5 (copy)
    psrlq       mm3,    8                   ; mm3 = p-1..p5
    punpcklbw   mm3,    mm0                 ; mm3 = words p-1..p2
    pmullw      mm3,    mm1                 ; mm3 = p-1..p2 * tap 1

    ; tap 4 * p2..p5
    movq        mm5,    mm4                 ; mm5 = p-2..p5 (copy)
    punpckhbw   mm4,    mm0                 ; mm4 = words p2..p5
    pmullw      mm4,    mm7                 ; mm4 *= tap 4
    paddsw      mm3,    mm4                 ; mm3 += mm4

    ; tap 2 * p0..p3
    movq        mm4,    mm5                 ; mm4 = p-2..p5 (copy)
    psrlq       mm5,    16                  ; mm5 = p0..p5
    punpcklbw   mm5,    mm0                 ; mm5 = words p0..p3
    pmullw      mm5,    mm2                 ; mm5 *= tap 2
    paddsw      mm3,    mm5                 ; mm3 += mm5

    ; tap 3 * p1..p4
    movq        mm5,    mm4                 ; mm5 = p-2..p5 (copy, used for tap 0)
    psrlq       mm4,    24                  ; mm4 = p1..p5
    punpcklbw   mm4,    mm0                 ; mm4 = words p1..p4
    pmullw      mm4,    mm6                 ; mm4 *= tap 3
    paddsw      mm3,    mm4                 ; mm3 += mm4

    ; outer taps
    movd        mm4,    [rsi+3]             ; mm4 = bytes p3..p6
    punpcklbw   mm4,    mm0                 ; mm4 = words p3..p6
    pmullw      mm4,    [rdx+80]            ; mm4 *= tap 5
    paddsw      mm3,    mm4                 ; mm3 += mm4

    punpcklbw   mm5,    mm0                 ; mm5 = words p-2..p1
    pmullw      mm5,    [rdx]               ; mm5 *= tap 0
    paddsw      mm3,    mm5                 ; mm3 += mm5

    paddsw      mm3,    [GLOBAL(rd)]        ; mm3 += 64 (rounding)
    psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7 (arithmetic)
    packuswb    mm3,    mm0                 ; clamp to 0..255 ...
    punpcklbw   mm3,    mm0                 ; ... and widen back to words

    movq        [rdi],  mm3                 ; store four 16-bit results

%if ABI_IS_32BIT
    add         rsi,    dword ptr arg(2)    ; src += src_pixels_per_line
    add         rdi,    rax                 ; output += output stride
%else
    movsxd      r8,     dword ptr arg(2)    ; r8 = src_pixels_per_line
    add         rdi,    rax                 ; output += output stride

    add         rsi,    r8                  ; src += src_pixels_per_line
%endif

    dec         rcx                         ; one row done
    jnz         .nextrow                    ; loop while rows remain

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit samples, producing four bytes per row.
; With row(r) = the four words at src_ptr + r * pixels_per_line (byte
; offset), each output row is
;     (sum(k=0..5) row(k-2) * tap[k] + 64) >> 7
; per column, clamped to 0..255. Products keep their low 16 bits (pmullw)
; and every addition saturates at 16 bits (paddsw).
;
; Notes on the arguments, as used by the code below:
;   * pixels_per_line is added unscaled to the source address, so it is the
;     source row stride in bytes.
;   * output_pitch is added to the output pointer after each row.
;   * vp8_filter: tap k is read as 4 words at byte offset 16*k.
;   * pixel_step and output_width are not read.
;   * output_height must be non-zero: the row counter is decremented and
;     tested only after a row has been processed.
;
; Register roles inside the loop:
;   rsi = address of row -2 for the current output row
;   rdx = source stride   rdi = output row   rax = output_pitch
;   rcx = rows remaining  rbx = vp8_filter
;   mm0 = 0   mm1 = tap1   mm2 = tap2   mm6 = tap3   mm7 = tap4
;   mm5 = rounding constant   mm3 = accumulator   mm4 = scratch
;-----------------------------------------------------------------------
globalsym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The rounding constant is loaded before rbx is reused as the filter
    ; pointer. NOTE(review): GLOBAL() presumably may address through the
    ; register given to GET_GOT -- confirm in x86_abi_support.asm before
    ; moving this load.
    movq        mm5,    [GLOBAL(rd)]        ; mm5 = 64 in each word
    push        rbx                         ; saved here, restored after the loop
    mov         rbx,    arg(7)              ; rbx = vp8_filter

    ; Taps 1..4 in registers; taps 0 and 5 are read from memory in the loop.
    ; Original note: "do both the negative taps first!!!" -- taps 1 and 4
    ; are the first two products accumulated below.
    movq        mm1,    [rbx + 16]          ; mm1 = tap 1
    movq        mm2,    [rbx + 32]          ; mm2 = tap 2
    movq        mm6,    [rbx + 48]          ; mm6 = tap 3
    movq        mm7,    [rbx + 64]          ; mm7 = tap 4

    movsxd      rdx,    dword ptr arg(3)    ; rdx = pixels_per_line (source stride)
    mov         rdi,    arg(1)              ; rdi = output_ptr
    mov         rsi,    arg(0)              ; rsi = src_ptr
    sub         rsi,    rdx
    sub         rsi,    rdx                 ; rsi = src_ptr - 2 * stride (row -2)
    movsxd      rcx,    DWORD PTR arg(5)    ; rcx = output_height (row counter)
    movsxd      rax,    DWORD PTR arg(2)    ; rax = output_pitch
    pxor        mm0,    mm0                 ; mm0 = 0, upper half for packuswb


.nextrow_cv:
    movq        mm3,    [rsi+rdx]           ; mm3 = p0..p3 of row -1
    pmullw      mm3,    mm1                 ; mm3 *= tap 1


    movq        mm4,    [rsi + 4*rdx]       ; mm4 = p0..p3 of row 2
    pmullw      mm4,    mm7                 ; mm4 *= tap 4
    paddsw      mm3,    mm4                 ; mm3 += mm4

    movq        mm4,    [rsi + 2*rdx]       ; mm4 = p0..p3 of row 0
    pmullw      mm4,    mm2                 ; mm4 *= tap 2
    paddsw      mm3,    mm4                 ; mm3 += mm4

    movq        mm4,    [rsi]               ; mm4 = p0..p3 of row -2
    pmullw      mm4,    [rbx]               ; mm4 *= tap 0
    paddsw      mm3,    mm4                 ; mm3 += mm4


    ; Stepping rsi one row here avoids needing 3*stride and 5*stride
    ; offsets, and is also the loop's only source advance.
    add         rsi,    rdx
    movq        mm4,    [rsi + 2*rdx]       ; mm4 = p0..p3 of row 1
    pmullw      mm4,    mm6                 ; mm4 *= tap 3
    paddsw      mm3,    mm4                 ; mm3 += mm4

    movq        mm4,    [rsi + 4*rdx]       ; mm4 = p0..p3 of row 3
    pmullw      mm4,    [rbx +80]           ; mm4 *= tap 5
    paddsw      mm3,    mm4                 ; mm3 += mm4


    paddsw      mm3,    mm5                 ; mm3 += 64 (rounding)
    psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7 (arithmetic)
    packuswb    mm3,    mm0                 ; clamp to 0..255, pack to bytes

    movd        [rdi],mm3                   ; store four result bytes
    ; Each iteration re-reads five of the six source rows loaded by the
    ; previous one. Since the block should be in cache this shouldn't cost
    ; much, but it is avoidable.
    lea         rdi,    [rdi+rax]           ; output += output_pitch (flags untouched)
    dec         rcx                         ; one row done
    jnz         .nextrow_cv                 ; loop while rows remain

    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA

; Rounding constant: 0x40 = 64, half of the 128 filter scale.
align 16
rd:
    times 4 dw 0x40

; Eight six-tap filters. Every tap is replicated across 8 words (16 bytes),
; so within one filter tap k starts at byte offset 16*k -- the layout the
; routines above index with +16, +32, ... +80. The six taps of each filter
; sum to 128, matching VP8_FILTER_SHIFT.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0